E-Commerce Retail Store Data Analysis¶
Data Pre processing¶
data <- read.csv("data.csv", encoding="ISO-8859-1")
head(data)
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country |
|---|---|---|---|---|---|---|---|
| 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 12/1/2010 8:26 | 2.55 | 17850 | United Kingdom |
| 536365 | 71053 | WHITE METAL LANTERN | 6 | 12/1/2010 8:26 | 3.39 | 17850 | United Kingdom |
| 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 12/1/2010 8:26 | 2.75 | 17850 | United Kingdom |
| 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 12/1/2010 8:26 | 3.39 | 17850 | United Kingdom |
| 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 12/1/2010 8:26 | 3.39 | 17850 | United Kingdom |
| 536365 | 22752 | SET 7 BABUSHKA NESTING BOXES | 2 | 12/1/2010 8:26 | 7.65 | 17850 | United Kingdom |
dim(data)
- 541909
- 8
length(unique(data$CustomerID))
head(data[data$Quantity < 0, ])
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
|---|---|---|---|---|---|---|---|---|
| 142 | C536379 | D | Discount | -1 | 12/1/2010 9:41 | 27.50 | 14527 | United Kingdom |
| 155 | C536383 | 35004C | SET OF 3 COLOURED FLYING DUCKS | -1 | 12/1/2010 9:49 | 4.65 | 15311 | United Kingdom |
| 236 | C536391 | 22556 | PLASTERS IN TIN CIRCUS PARADE | -12 | 12/1/2010 10:24 | 1.65 | 17548 | United Kingdom |
| 237 | C536391 | 21984 | PACK OF 12 PINK PAISLEY TISSUES | -24 | 12/1/2010 10:24 | 0.29 | 17548 | United Kingdom |
| 238 | C536391 | 21983 | PACK OF 12 BLUE PAISLEY TISSUES | -24 | 12/1/2010 10:24 | 0.29 | 17548 | United Kingdom |
| 239 | C536391 | 21980 | PACK OF 12 RED RETROSPOT TISSUES | -24 | 12/1/2010 10:24 | 0.29 | 17548 | United Kingdom |
str(data)
'data.frame': 541909 obs. of 8 variables: $ InvoiceNo : Factor w/ 25900 levels "536365","536366",..: 1 1 1 1 1 1 1 2 2 3 ... $ StockCode : Factor w/ 4070 levels "10002","10080",..: 3538 2795 3045 2986 2985 1663 801 1548 1547 3306 ... $ Description: Factor w/ 4224 levels ""," 4 PURPLE FLOCK DINNER CANDLES",..: 4027 4035 932 1959 2980 3235 1573 1698 1695 259 ... $ Quantity : int 6 6 8 6 6 2 6 6 6 32 ... $ InvoiceDate: Factor w/ 23260 levels "1/10/2011 10:04",..: 6839 6839 6839 6839 6839 6839 6839 6840 6840 6841 ... $ UnitPrice : num 2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ... $ CustomerID : int 17850 17850 17850 17850 17850 17850 17850 17850 17850 13047 ... $ Country : Factor w/ 38 levels "Australia","Austria",..: 36 36 36 36 36 36 36 36 36 36 ...
Missing values calculation¶
null_percentage <- colMeans(is.na(data)) * 100
null_percentage
- InvoiceNo
- 0
- StockCode
- 0
- Description
- 0
- Quantity
- 0
- InvoiceDate
- 0
- UnitPrice
- 0
- CustomerID
- 24.9266943342886
- Country
- 0
colSums(is.na(data[, sapply(data, is.numeric)]))
- Quantity
- 0
- UnitPrice
- 0
- CustomerID
- 135080
colSums(is.na(data[, sapply(data, is.factor)]))
- InvoiceNo
- 0
- StockCode
- 0
- Description
- 0
- InvoiceDate
- 0
- Country
- 0
Summary statistics¶
numeric_data <- data[, sapply(data, is.numeric)]
# Generate summary statistics for integer and numeric variables
t(summary(numeric_data))
Quantity Min. :-80995.00 1st Qu.: 1.00 Median : 3.00
UnitPrice Min. :-11062.06 1st Qu.: 1.25 Median : 2.08
CustomerID Min. :12346 1st Qu.:13953 Median :15152
Quantity Mean : 9.55 3rd Qu.: 10.00 Max. : 80995.00
UnitPrice Mean : 4.61 3rd Qu.: 4.13 Max. : 38970.00
CustomerID Mean :15288 3rd Qu.:16791 Max. :18287
Quantity
UnitPrice
CustomerID NA's :135080
categorical_data <- data[, sapply(data, is.factor)]
# Generate summary statistics for categorical variables
t(summary(categorical_data))
InvoiceNo 573585 : 1114
StockCode 85123A : 2313
Description WHITE HANGING HEART T-LIGHT HOLDER: 2369
InvoiceDate 10/31/2011 14:41: 1114
Country United Kingdom:495478
InvoiceNo 581219 : 749
StockCode 22423 : 2203
Description REGENCY CAKESTAND 3 TIER : 2200
InvoiceDate 12/8/2011 9:28 : 749
Country Germany : 9495
InvoiceNo 581492 : 731
StockCode 85099B : 2159
Description JUMBO BAG RED RETROSPOT : 2159
InvoiceDate 12/9/2011 10:03 : 731
Country France : 8557
InvoiceNo 580729 : 721
StockCode 47566 : 1727
Description PARTY BUNTING : 1727
InvoiceDate 12/5/2011 17:24 : 721
Country EIRE : 8196
InvoiceNo 558475 : 705
StockCode 20725 : 1639
Description LUNCH BAG RED RETROSPOT : 1638
InvoiceDate 6/29/2011 15:58 : 705
Country Spain : 2533
InvoiceNo 579777 : 687
StockCode 84879 : 1502
Description ASSORTED COLOUR BIRD ORNAMENT : 1501
InvoiceDate 11/30/2011 15:13: 687
Country Netherlands : 2371
InvoiceNo (Other):537202
StockCode (Other):530366
Description (Other) :530315
InvoiceDate (Other) :537202
Country (Other) : 15279
The proportions of transactions originating from each country in the dataset, providing insights into the geographical distribution of sales.¶
country_proportions <- prop.table(table(data$Country))
# Format the proportions as decimals
formatted_proportions <- format(country_proportions, scientific = FALSE)
formatted_proportions
- Australia
- '0.00232326830'
- Austria
- '0.00073997664'
- Bahrain
- '0.00003506124'
- Belgium
- '0.00381798420'
- Brazil
- '0.00005905050'
- Canada
- '0.00027864457'
- Channel Islands
- '0.00139875883'
- Cyprus
- '0.00114779419'
- Czech Republic
- '0.00005535985'
- Denmark
- '0.00071783270'
- EIRE
- '0.01512431054'
- European Community
- '0.00011256502'
- Finland
- '0.00128250315'
- France
- '0.01579047405'
- Germany
- '0.01752139197'
- Greece
- '0.00026941793'
- Hong Kong
- '0.00053145454'
- Iceland
- '0.00033584975'
- Israel
- '0.00054806250'
- Italy
- '0.00148179860'
- Japan
- '0.00066062752'
- Lebanon
- '0.00008303977'
- Lithuania
- '0.00006458649'
- Malta
- '0.00023435669'
- Netherlands
- '0.00437527334'
- Norway
- '0.00200402651'
- Poland
- '0.00062925694'
- Portugal
- '0.00280305365'
- RSA
- '0.00010702904'
- Saudi Arabia
- '0.00001845328'
- Singapore
- '0.00042258017'
- Spain
- '0.00467421652'
- Sweden
- '0.00085254166'
- Switzerland
- '0.00369434721'
- United Arab Emirates
- '0.00012548232'
- United Kingdom
- '0.91431956288'
- Unspecified
- '0.00082301641'
- USA
- '0.00053699053'
Convert "Description" to character type and remove extra whitespaces¶
data$InvoiceNo <- as.character(data$InvoiceNo)
data$Description <- as.character(data$Description)
data$Description <- trimws(data$Description)
head(data)
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country |
|---|---|---|---|---|---|---|---|
| 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 12/1/2010 8:26 | 2.55 | 17850 | United Kingdom |
| 536365 | 71053 | WHITE METAL LANTERN | 6 | 12/1/2010 8:26 | 3.39 | 17850 | United Kingdom |
| 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 12/1/2010 8:26 | 2.75 | 17850 | United Kingdom |
| 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 12/1/2010 8:26 | 3.39 | 17850 | United Kingdom |
| 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 12/1/2010 8:26 | 3.39 | 17850 | United Kingdom |
| 536365 | 22752 | SET 7 BABUSHKA NESTING BOXES | 2 | 12/1/2010 8:26 | 7.65 | 17850 | United Kingdom |
str(data)
'data.frame': 541909 obs. of 8 variables: $ InvoiceNo : chr "536365" "536365" "536365" "536365" ... $ StockCode : Factor w/ 4070 levels "10002","10080",..: 3538 2795 3045 2986 2985 1663 801 1548 1547 3306 ... $ Description: chr "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ... $ Quantity : int 6 6 8 6 6 2 6 6 6 32 ... $ InvoiceDate: Factor w/ 23260 levels "1/10/2011 10:04",..: 6839 6839 6839 6839 6839 6839 6839 6840 6840 6841 ... $ UnitPrice : num 2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ... $ CustomerID : int 17850 17850 17850 17850 17850 17850 17850 17850 17850 13047 ... $ Country : Factor w/ 38 levels "Australia","Austria",..: 36 36 36 36 36 36 36 36 36 36 ...
Remove missing values¶
data <- data[!is.na(data$CustomerID), ]
colMeans(is.na(data)) * 100
- InvoiceNo
- 0
- StockCode
- 0
- Description
- 0
- Quantity
- 0
- InvoiceDate
- 0
- UnitPrice
- 0
- CustomerID
- 0
- Country
- 0
To accurately understand customer behavior, we need complete data on who the customers are. So, we're removing rows where we don't have customer IDs. This helps keep our analysis accurate and ensures we're not missing any information about the products they purchased.
Identifying and removing duplicates¶
sum(duplicated(data))
duplicates <- duplicated(data)
# Extract duplicate rows
duplicate_rows <- data[duplicates, ]
duplicate_rows
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
|---|---|---|---|---|---|---|---|---|
| 518 | 536409 | 21866 | UNION JACK FLAG LUGGAGE TAG | 1 | 12/1/2010 11:45 | 1.25 | 17908 | United Kingdom |
| 528 | 536409 | 22866 | HAND WARMER SCOTTY DOG DESIGN | 1 | 12/1/2010 11:45 | 2.10 | 17908 | United Kingdom |
| 538 | 536409 | 22900 | SET 2 TEA TOWELS I LOVE LONDON | 1 | 12/1/2010 11:45 | 2.95 | 17908 | United Kingdom |
| 540 | 536409 | 22111 | SCOTTIE DOG HOT WATER BOTTLE | 1 | 12/1/2010 11:45 | 4.95 | 17908 | United Kingdom |
| 556 | 536412 | 22327 | ROUND SNACK BOXES SET OF 4 SKULLS | 1 | 12/1/2010 11:49 | 2.95 | 17920 | United Kingdom |
| 588 | 536412 | 22273 | FELTCRAFT DOLL MOLLY | 1 | 12/1/2010 11:49 | 2.95 | 17920 | United Kingdom |
| 590 | 536412 | 22749 | FELTCRAFT PRINCESS CHARLOTTE DOLL | 1 | 12/1/2010 11:49 | 3.75 | 17920 | United Kingdom |
| 595 | 536412 | 22141 | CHRISTMAS CRAFT TREE TOP ANGEL | 1 | 12/1/2010 11:49 | 2.10 | 17920 | United Kingdom |
| 599 | 536412 | 21448 | 12 DAISY PEGS IN WOOD BOX | 1 | 12/1/2010 11:49 | 1.65 | 17920 | United Kingdom |
| 601 | 536412 | 22569 | FELTCRAFT CUSHION BUTTERFLY | 2 | 12/1/2010 11:49 | 3.75 | 17920 | United Kingdom |
| 602 | 536412 | 21448 | 12 DAISY PEGS IN WOOD BOX | 2 | 12/1/2010 11:49 | 1.65 | 17920 | United Kingdom |
| 605 | 536412 | 21448 | 12 DAISY PEGS IN WOOD BOX | 2 | 12/1/2010 11:49 | 1.65 | 17920 | United Kingdom |
| 606 | 536412 | 22902 | TOTE BAG I LOVE LONDON | 7 | 12/1/2010 11:49 | 2.10 | 17920 | United Kingdom |
| 617 | 536412 | 21708 | FOLDING UMBRELLA CREAM POLKADOT | 1 | 12/1/2010 11:49 | 4.95 | 17920 | United Kingdom |
| 618 | 536412 | 22900 | SET 2 TEA TOWELS I LOVE LONDON | 2 | 12/1/2010 11:49 | 2.95 | 17920 | United Kingdom |
| 619 | 536412 | 21706 | FOLDING UMBRELLA RED/WHITE POLKADOT | 1 | 12/1/2010 11:49 | 4.95 | 17920 | United Kingdom |
| 621 | 536412 | 85184C | S/4 VALENTINE DECOUPAGE HEART BOX | 1 | 12/1/2010 11:49 | 2.95 | 17920 | United Kingdom |
| 759 | 536446 | 21651 | HANGING GLASS ETCHED TEALIGHT | 6 | 12/1/2010 12:15 | 1.65 | 15983 | United Kingdom |
| 832 | 536464 | 22866 | HAND WARMER SCOTTY DOG DESIGN | 1 | 12/1/2010 12:23 | 2.10 | 17968 | United Kingdom |
| 836 | 536464 | 22945 | CHRISTMAS METAL TAGS ASSORTED | 6 | 12/1/2010 12:23 | 0.85 | 17968 | United Kingdom |
| 853 | 536464 | 21992 | VINTAGE PAISLEY STATIONERY SET | 1 | 12/1/2010 12:23 | 2.95 | 17968 | United Kingdom |
| 860 | 536464 | 22866 | HAND WARMER SCOTTY DOG DESIGN | 1 | 12/1/2010 12:23 | 2.10 | 17968 | United Kingdom |
| 892 | 536488 | 84347 | ROTATING SILVER ANGELS T-LIGHT HLDR | 1 | 12/1/2010 12:31 | 2.55 | 17897 | United Kingdom |
| 1062 | 536522 | 21121 | SET/10 RED POLKADOT PARTY CANDLES | 1 | 12/1/2010 12:49 | 1.25 | 15012 | United Kingdom |
| 1068 | 536522 | 21122 | SET/10 PINK POLKADOT PARTY CANDLES | 1 | 12/1/2010 12:49 | 1.25 | 15012 | United Kingdom |
| 1141 | 536528 | 22865 | HAND WARMER OWL DESIGN | 1 | 12/1/2010 13:17 | 2.10 | 15525 | United Kingdom |
| 1154 | 536528 | 85114B | IVORY ENCHANTED FOREST PLACEMAT | 1 | 12/1/2010 13:17 | 1.65 | 15525 | United Kingdom |
| 1160 | 536528 | 22584 | PACK OF 6 PANNETONE GIFT BOXES | 2 | 12/1/2010 13:17 | 2.55 | 15525 | United Kingdom |
| 1166 | 536528 | 22911 | PAPER CHAIN KIT LONDON | 1 | 12/1/2010 13:17 | 2.95 | 15525 | United Kingdom |
| 1167 | 536528 | 22743 | MAKE YOUR OWN FLOWERPOWER CARD KIT | 1 | 12/1/2010 13:17 | 2.95 | 15525 | United Kingdom |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 537818 | 581352 | 22481 | BLACK TEA TOWEL CLASSIC DESIGN | 2 | 12/8/2011 12:26 | 0.39 | 14698 | United Kingdom |
| 537820 | 581352 | 82494L | WOODEN FRAME ANTIQUE WHITE | 1 | 12/8/2011 12:26 | 2.95 | 14698 | United Kingdom |
| 537825 | 581352 | 22564 | ALPHABET STENCIL CRAFT | 1 | 12/8/2011 12:26 | 1.25 | 14698 | United Kingdom |
| 538295 | 581404 | 22469 | HEART OF WICKER SMALL | 2 | 12/8/2011 13:47 | 1.65 | 13680 | United Kingdom |
| 538377 | 581405 | 22500 | SET OF 2 TINS JARDIN DE PROVENCE | 1 | 12/8/2011 13:50 | 1.25 | 13521 | United Kingdom |
| 538467 | 581405 | 20975 | 12 PENCILS SMALL TUBE RED RETROSPOT | 1 | 12/8/2011 13:50 | 0.65 | 13521 | United Kingdom |
| 538495 | 581405 | 23212 | HEART WREATH DECORATION WITH BELL | 1 | 12/8/2011 13:50 | 1.25 | 13521 | United Kingdom |
| 538627 | 581412 | 21481 | FAWN BLUE HOT WATER BOTTLE | 1 | 12/8/2011 14:38 | 3.75 | 14415 | United Kingdom |
| 538678 | 581412 | 22199 | FRYING PAN RED RETROSPOT | 1 | 12/8/2011 14:38 | 1.25 | 14415 | United Kingdom |
| 538752 | 581414 | 22326 | ROUND SNACK BOXES SET OF4 WOODLAND | 1 | 12/8/2011 14:39 | 2.95 | 14730 | United Kingdom |
| 538757 | 581414 | 22094 | RED RETROSPOT TISSUE BOX | 1 | 12/8/2011 14:39 | 0.39 | 14730 | United Kingdom |
| 538763 | 581414 | 22379 | RECYCLING BAG RETROSPOT | 1 | 12/8/2011 14:39 | 2.10 | 14730 | United Kingdom |
| 538772 | 581414 | 23291 | DOLLY GIRL CHILDRENS CUP | 1 | 12/8/2011 14:39 | 1.25 | 14730 | United Kingdom |
| 538777 | 581414 | 22327 | ROUND SNACK BOXES SET OF 4 SKULLS | 1 | 12/8/2011 14:39 | 2.95 | 14730 | United Kingdom |
| 538782 | 581414 | 22327 | ROUND SNACK BOXES SET OF 4 SKULLS | 1 | 12/8/2011 14:39 | 2.95 | 14730 | United Kingdom |
| 538786 | 581414 | 23454 | THREE MINI HANGING FRAMES | 1 | 12/8/2011 14:39 | 4.15 | 14730 | United Kingdom |
| 538943 | 581425 | 85152 | HAND OVER THE CHOCOLATE SIGN | 12 | 12/8/2011 15:31 | 2.10 | 14796 | United Kingdom |
| 538956 | 581425 | 21231 | SWEETHEART CERAMIC TRINKET BOX | 4 | 12/8/2011 15:31 | 1.25 | 14796 | United Kingdom |
| 538971 | 581425 | 23178 | JAM CLOCK MAGNET | 1 | 12/8/2011 15:31 | 1.25 | 14796 | United Kingdom |
| 539893 | 581449 | 22423 | REGENCY CAKESTAND 3 TIER | 1 | 12/8/2011 17:37 | 12.75 | 12748 | United Kingdom |
| 539927 | 581450 | 22118 | JOY WOODEN BLOCK LETTERS | 1 | 12/8/2011 17:54 | 1.25 | 16794 | United Kingdom |
| 540033 | 581456 | 35964 | FOLKART CLIP ON STARS | 2 | 12/8/2011 18:42 | 0.39 | 17530 | United Kingdom |
| 540199 | 581471 | 21411 | GINGHAM HEART DOORSTOP RED | 2 | 12/8/2011 19:29 | 1.95 | 14702 | United Kingdom |
| 541612 | 581514 | 22075 | 6 RIBBONS ELEGANT CHRISTMAS | 24 | 12/9/2011 11:20 | 0.39 | 17754 | United Kingdom |
| 541656 | 581538 | 23275 | SET OF 3 HANGING OWLS OLLIE BEAK | 1 | 12/9/2011 11:34 | 1.25 | 14446 | United Kingdom |
| 541676 | 581538 | 22068 | BLACK PIRATE TREASURE CHEST | 1 | 12/9/2011 11:34 | 0.39 | 14446 | United Kingdom |
| 541690 | 581538 | 23318 | BOX OF 6 MINI VINTAGE CRACKERS | 1 | 12/9/2011 11:34 | 2.49 | 14446 | United Kingdom |
| 541693 | 581538 | 22992 | REVOLVER WOODEN RULER | 1 | 12/9/2011 11:34 | 1.95 | 14446 | United Kingdom |
| 541700 | 581538 | 22694 | WICKER STAR | 1 | 12/9/2011 11:34 | 2.10 | 14446 | United Kingdom |
| 541702 | 581538 | 23343 | JUMBO BAG VINTAGE CHRISTMAS | 1 | 12/9/2011 11:34 | 2.08 | 14446 | United Kingdom |
data <- data[!duplicated(data), ]
# Reset row names
row.names(data) <- NULL
# Print the data frame after removing duplicates
head(data)
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country |
|---|---|---|---|---|---|---|---|
| 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 12/1/2010 8:26 | 2.55 | 17850 | United Kingdom |
| 536365 | 71053 | WHITE METAL LANTERN | 6 | 12/1/2010 8:26 | 3.39 | 17850 | United Kingdom |
| 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 12/1/2010 8:26 | 2.75 | 17850 | United Kingdom |
| 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 12/1/2010 8:26 | 3.39 | 17850 | United Kingdom |
| 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 12/1/2010 8:26 | 3.39 | 17850 | United Kingdom |
| 536365 | 22752 | SET 7 BABUSHKA NESTING BOXES | 2 | 12/1/2010 8:26 | 7.65 | 17850 | United Kingdom |
dim(data)
- 401604
- 8
The presence of entirely identical rows, including identical transaction times, implies potential data recording errors rather than genuine repeated transactions. Removing these duplicate rows will enhance dataset cleanliness, improving accuracy in customer clustering based on purchasing behaviors
length(unique(data$CustomerID))
Initial observation of the transaction dataset revealed negative values in the "Quantity" column, suggesting returned items or cancelled orders. These entries seem to correlate with invoice numbers beginning with the letter 'C', possibly indicating cancellations
data$is_Cancelled <- startsWith(data$InvoiceNo, "C")
head(data)
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | is_Cancelled |
|---|---|---|---|---|---|---|---|---|
| 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 12/1/2010 8:26 | 2.55 | 17850 | United Kingdom | FALSE |
| 536365 | 71053 | WHITE METAL LANTERN | 6 | 12/1/2010 8:26 | 3.39 | 17850 | United Kingdom | FALSE |
| 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 12/1/2010 8:26 | 2.75 | 17850 | United Kingdom | FALSE |
| 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 12/1/2010 8:26 | 3.39 | 17850 | United Kingdom | FALSE |
| 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 12/1/2010 8:26 | 3.39 | 17850 | United Kingdom | FALSE |
| 536365 | 22752 | SET 7 BABUSHKA NESTING BOXES | 2 | 12/1/2010 8:26 | 7.65 | 17850 | United Kingdom | FALSE |
prop.table(table(data$is_Cancelled))
FALSE TRUE
0.97790859 0.02209141
format(prop.table(table(data$is_Cancelled)), scientific=FALSE)
- FALSE
- '0.97790859'
- TRUE
- '0.02209141'
The percentage of cancelled transactions in the dataset is: 2.20%
cancelled_data <- data[data$is_Cancelled, ]
# Remove the "CustomerID" column
cancelled_data <- cancelled_data[, !names(cancelled_data) %in% "CustomerID"]
# Generate summary statistics for the filtered data frame
summary_data <- summary(cancelled_data)
# Transpose the summary statistics
summary_data_transposed <- t(summary_data)
# Print the transposed summary statistics
print(summary_data_transposed)
InvoiceNo Length:8872 Class :character
StockCode 22423 : 180 M : 175
Description Length:8872 Class :character
Quantity Min. :-80995.00 1st Qu.: -6.00
InvoiceDate 10/12/2011 16:17: 101 7/19/2011 12:26 : 57
UnitPrice Min. : 0.01 1st Qu.: 1.45
Country United Kingdom:7501 Germany : 453
is_Cancelled Mode:logical TRUE:8872
InvoiceNo Mode :character
StockCode POST : 97 22960 : 86
Description Mode :character
Quantity Median : -2.00 Mean : -30.77
InvoiceDate 3/31/2011 11:58 : 45 7/21/2011 13:00 : 40
UnitPrice Median : 2.95 Mean : 18.90
Country EIRE : 247 France : 148
is_Cancelled
InvoiceNo
StockCode D : 77 22720 : 72
Description
Quantity 3rd Qu.: -1.00 Max. : -1.00
InvoiceDate 10/12/2011 13:15: 39 10/6/2011 19:51 : 36
UnitPrice 3rd Qu.: 4.95 Max. :38970.00
Country USA : 112 Australia : 74
is_Cancelled
InvoiceNo
StockCode (Other):8185
Description
Quantity
InvoiceDate (Other) :8554
UnitPrice
Country (Other) : 337
is_Cancelled
Insights from the cancelled transactions data reveal all quantities as negative, indicating cancellations. The UnitPrice column shows a varied range of products involved. Retaining cancelled transactions allows for deeper analysis, potentially enhancing clustering and improving recommendation systems by understanding cancellation patterns
Exploratory Data Analysis¶
library(ggplot2)
# Calculate the interquartile range (IQR) for the Quantity column
Q1 <- quantile(data$Quantity, 0.02)
Q3 <- quantile(data$Quantity, 0.98)
IQR <- Q3 - Q1
# Define the lower and upper bounds for outliers
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
# Filter out rows with Quantity values outside the bounds
data_filtered <- data[data$Quantity >= lower_bound & data$Quantity <= upper_bound, ]
# Create a histogram of Quantity with the filtered data
ggplot(data_filtered, aes(x = Quantity)) +
geom_histogram(binwidth = 10, fill = "skyblue", color = "black") +
labs(title = "Distribution of Quantity (Filtered)", x = "Quantity", y = "Frequency")
library(ggplot2)
# Create a box plot of the "Quantity" column
ggplot(data, aes(y = Quantity)) +
geom_boxplot(notch = TRUE) +
labs(title = "Box Plot of Quantity")
# Filter rows where is_Cancelled is FALSE
filtered_data <- data[!data$is_Cancelled, ]
# Remove the "CustomerID" column
filtered_data <- subset(filtered_data, select = -CustomerID)
# Transpose the summary statistics
summary_data_transposed <- t(summary(filtered_data))
# Print the transposed summary statistics
print(summary_data_transposed)
InvoiceNo Length:392732 Class :character
StockCode 85123A : 2023 22423 : 1714
Description Length:392732 Class :character
Quantity Min. : 1.00 1st Qu.: 2.00
InvoiceDate 11/14/2011 15:27: 542 11/28/2011 15:54: 533
UnitPrice Min. : 0.000 1st Qu.: 1.250
Country United Kingdom:349227 Germany : 9027
is_Cancelled Mode :logical FALSE:392732
InvoiceNo Mode :character
StockCode 85099B : 1615 84879 : 1395
Description Mode :character
Quantity Median : 6.00 Mean : 13.15
InvoiceDate 12/5/2011 17:17 : 529 11/23/2011 13:39: 443
UnitPrice Median : 1.950 Mean : 3.126
Country France : 8327 EIRE : 7228
is_Cancelled
InvoiceNo
StockCode 47566 : 1390 20725 : 1304
Description
Quantity 3rd Qu.: 12.00 Max. :80995.00
InvoiceDate 10/31/2011 14:09: 435 9/21/2011 14:40 : 421
UnitPrice 3rd Qu.: 3.750 Max. :8142.750
Country Spain : 2480 Netherlands : 2363
is_Cancelled
InvoiceNo
StockCode (Other):383291
Description
Quantity
InvoiceDate (Other) :389829
UnitPrice
Country (Other) : 14080
is_Cancelled
non_cancelled_data <- subset(data, !is_Cancelled)
# Create a box plot of the "Quantity" column
ggplot(non_cancelled_data, aes(y = Quantity)) +
geom_boxplot(notch = TRUE) +
labs(title = "Box Plot of Quantity")
The box plot analysis of Quantity indicates a right-skewed distribution with a median suggesting most orders consist of small quantities. While the majority fall within a narrow interquartile range (IQR), outliers representing extremely large orders are present, possibly indicating bulk purchases or data anomalies. Overall, the platform tends to handle small to moderate quantity orders, with rare outliers potentially offering insights into product success or data integrity issues.ordering system.
# Load the ggplot2 and scales packages
library(ggplot2)
library(scales)
# Filter the data frame to include only non-cancelled orders
non_cancelled_data <- subset(data, !is_Cancelled)
# Create a box plot of the "Quantity" column with logarithmic scale on the y-axis
ggplot(non_cancelled_data, aes(y = Quantity)) +
geom_boxplot(notch = TRUE) +
scale_y_continuous(trans = "log10") +
labs(title = "Box Plot of Quantity")
Feature engineering¶
length(unique(data$StockCode))
# Load the stringr package
library(stringr)
# Create a new column "len_StockCode" to store the length of trimmed "StockCode" values
data$len_StockCode <- nchar(str_trim(data$StockCode))
# Print the modified data frame
head(data)
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | is_Cancelled | len_StockCode |
|---|---|---|---|---|---|---|---|---|---|
| 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 12/1/2010 8:26 | 2.55 | 17850 | United Kingdom | FALSE | 6 |
| 536365 | 71053 | WHITE METAL LANTERN | 6 | 12/1/2010 8:26 | 3.39 | 17850 | United Kingdom | FALSE | 5 |
| 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 12/1/2010 8:26 | 2.75 | 17850 | United Kingdom | FALSE | 6 |
| 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 12/1/2010 8:26 | 3.39 | 17850 | United Kingdom | FALSE | 6 |
| 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 12/1/2010 8:26 | 3.39 | 17850 | United Kingdom | FALSE | 6 |
| 536365 | 22752 | SET 7 BABUSHKA NESTING BOXES | 2 | 12/1/2010 8:26 | 7.65 | 17850 | United Kingdom | FALSE | 5 |
format(prop.table(table(data$len_StockCode)), scientific=FALSE)
- 1
- '0.00133713808'
- 2
- '0.00033366202'
- 3
- '0.00003984024'
- 4
- '0.00302785829'
- 5
- '0.91179121722'
- 6
- '0.08270335953'
- 7
- '0.00073704445'
- 12
- '0.00002988018'
The majority of stock codes adhere to a standard format of five numeric characters, while anomalies such as non-numeric and single-digit codes are present but rare. These anomalies, comprising only 0.48% of the dataset, likely represent non-product transactions and their inclusion may introduce noise in subsequent analyses. To ensure focus on genuine product transactions and improve accuracy, it's recommended to filter out rows with anomalous stock codes before further analysis and model development.
num_unique_stockcodes <- length(unique(data[data$len_StockCode == 5, "StockCode"]))
# Print the number of unique stock codes
print(num_unique_stockcodes)
[1] 2798
filtered_data <- data[data$len_StockCode < 5, ]
# Print the filtered data frame
filtered_data
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | is_Cancelled | len_StockCode | |
|---|---|---|---|---|---|---|---|---|---|---|
| 46 | 536370 | POST | POSTAGE | 3 | 12/1/2010 8:45 | 18.00 | 12583 | France | FALSE | 4 |
| 142 | C536379 | D | Discount | -1 | 12/1/2010 9:41 | 27.50 | 14527 | United Kingdom | TRUE | 1 |
| 387 | 536403 | POST | POSTAGE | 1 | 12/1/2010 11:27 | 15.00 | 12791 | Netherlands | FALSE | 4 |
| 1098 | 536527 | POST | POSTAGE | 1 | 12/1/2010 13:04 | 18.00 | 12662 | Germany | FALSE | 4 |
| 1387 | 536540 | C2 | CARRIAGE | 1 | 12/1/2010 14:05 | 50.00 | 14911 | EIRE | FALSE | 2 |
| 1655 | 536569 | M | Manual | 1 | 12/1/2010 15:35 | 1.25 | 16274 | United Kingdom | FALSE | 1 |
| 1666 | 536569 | M | Manual | 1 | 12/1/2010 15:35 | 18.95 | 16274 | United Kingdom | FALSE | 1 |
| 3790 | 536840 | POST | POSTAGE | 1 | 12/2/2010 18:27 | 18.00 | 12738 | Germany | FALSE | 4 |
| 3969 | 536852 | POST | POSTAGE | 1 | 12/3/2010 9:51 | 18.00 | 12686 | France | FALSE | 4 |
| 4036 | 536858 | POST | POSTAGE | 2 | 12/3/2010 10:36 | 40.00 | 13520 | Switzerland | FALSE | 4 |
| 4080 | 536861 | POST | POSTAGE | 3 | 12/3/2010 10:44 | 18.00 | 12427 | Germany | FALSE | 4 |
| 4411 | 536967 | POST | POSTAGE | 1 | 12/3/2010 12:57 | 18.00 | 12600 | Germany | FALSE | 4 |
| 4485 | 536974 | POST | POSTAGE | 2 | 12/3/2010 13:59 | 18.00 | 12682 | France | FALSE | 4 |
| 4607 | 536981 | M | Manual | 2 | 12/3/2010 14:26 | 0.85 | 14723 | United Kingdom | FALSE | 1 |
| 4636 | 536983 | POST | POSTAGE | 1 | 12/3/2010 14:30 | 18.00 | 12712 | Germany | FALSE | 4 |
| 4802 | 536990 | POST | POSTAGE | 1 | 12/3/2010 15:14 | 28.00 | 12793 | Portugal | FALSE | 4 |
| 4865 | 537022 | POST | POSTAGE | 2 | 12/3/2010 15:45 | 28.00 | 12725 | Italy | FALSE | 4 |
| 4903 | 537026 | POST | POSTAGE | 2 | 12/3/2010 16:35 | 18.00 | 12395 | Belgium | FALSE | 4 |
| 5550 | 537065 | POST | POSTAGE | 9 | 12/5/2010 11:57 | 18.00 | 12567 | France | FALSE | 4 |
| 5573 | 537077 | M | Manual | 12 | 12/5/2010 11:59 | 0.42 | 17062 | United Kingdom | FALSE | 1 |
| 6116 | 537137 | M | Manual | 36 | 12/5/2010 12:43 | 0.85 | 16327 | United Kingdom | FALSE | 1 |
| 6173 | 537140 | M | Manual | 1 | 12/5/2010 12:53 | 0.42 | 12748 | United Kingdom | FALSE | 1 |
| 6174 | 537140 | M | Manual | 1 | 12/5/2010 12:53 | 0.85 | 12748 | United Kingdom | FALSE | 1 |
| 6597 | C537164 | D | Discount | -1 | 12/5/2010 13:21 | 29.29 | 14527 | United Kingdom | TRUE | 1 |
| 6841 | 537197 | POST | POSTAGE | 3 | 12/5/2010 14:02 | 18.00 | 12647 | Germany | FALSE | 4 |
| 6846 | 537198 | POST | POSTAGE | 1 | 12/5/2010 14:03 | 18.00 | 12647 | Germany | FALSE | 4 |
| 6866 | 537199 | M | Manual | 1 | 12/5/2010 14:06 | 0.65 | 15894 | United Kingdom | FALSE | 1 |
| 6977 | 537201 | POST | POSTAGE | 6 | 12/5/2010 14:19 | 18.00 | 12472 | Germany | FALSE | 4 |
| 7105 | 537208 | M | Manual | 4 | 12/5/2010 15:12 | 0.85 | 15889 | United Kingdom | FALSE | 1 |
| 7257 | 537212 | POST | POSTAGE | 4 | 12/5/2010 15:21 | 18.00 | 12720 | Germany | FALSE | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 393745 | 580736 | POST | POSTAGE | 2 | 12/6/2011 8:55 | 18.00 | 12716 | France | FALSE | 4 |
| 393855 | 580752 | POST | POSTAGE | 3 | 12/6/2011 9:56 | 45.00 | 12478 | Greece | FALSE | 4 |
| 393922 | 580753 | POST | POSTAGE | 3 | 12/6/2011 10:00 | 18.00 | 12682 | France | FALSE | 4 |
| 393929 | 580756 | POST | POSTAGE | 1 | 12/6/2011 10:19 | 18.00 | 12723 | France | FALSE | 4 |
| 394903 | 580884 | M | Manual | 1 | 12/6/2011 12:21 | 0.85 | 15907 | United Kingdom | FALSE | 1 |
| 395434 | 580955 | POST | POSTAGE | 1 | 12/6/2011 14:22 | 28.00 | 12442 | Spain | FALSE | 4 |
| 395435 | 580956 | M | Manual | 4 | 12/6/2011 14:23 | 1.25 | 17841 | United Kingdom | FALSE | 1 |
| 395619 | C580957 | POST | POSTAGE | -1 | 12/6/2011 14:23 | 4.50 | 12839 | United Kingdom | TRUE | 4 |
| 395773 | 580965 | POST | POSTAGE | 3 | 12/6/2011 14:52 | 15.00 | 12417 | Belgium | FALSE | 4 |
| 395919 | 580979 | POST | POSTAGE | 3 | 12/6/2011 15:40 | 18.00 | 12362 | Belgium | FALSE | 4 |
| 396074 | 580986 | POST | POSTAGE | 4 | 12/6/2011 16:34 | 18.00 | 12650 | France | FALSE | 4 |
| 396276 | 581000 | POST | POSTAGE | 5 | 12/7/2011 8:03 | 18.00 | 12720 | Germany | FALSE | 4 |
| 396288 | 581001 | POST | POSTAGE | 3 | 12/7/2011 8:07 | 18.00 | 12583 | France | FALSE | 4 |
| 396441 | C581009 | M | Manual | -1 | 12/7/2011 9:15 | 125.00 | 16971 | United Kingdom | TRUE | 1 |
| 397436 | C581145 | M | Manual | -1 | 12/7/2011 13:48 | 9.95 | 17490 | United Kingdom | TRUE | 1 |
| 397917 | 581171 | POST | POSTAGE | 2 | 12/7/2011 15:02 | 18.00 | 12615 | France | FALSE | 4 |
| 398114 | 581179 | POST | POSTAGE | 1 | 12/7/2011 15:43 | 240.00 | 12471 | Germany | FALSE | 4 |
| 398159 | 581182 | POST | POSTAGE | 4 | 12/7/2011 15:56 | 28.00 | 12783 | Portugal | FALSE | 4 |
| 398170 | 581183 | POST | POSTAGE | 4 | 12/7/2011 16:24 | 18.00 | 12569 | Germany | FALSE | 4 |
| 398174 | 581184 | POST | POSTAGE | 2 | 12/7/2011 16:24 | 18.00 | 12569 | Germany | FALSE | 4 |
| 398627 | 581232 | POST | POSTAGE | 4 | 12/8/2011 10:26 | 40.00 | 12358 | Austria | FALSE | 4 |
| 398846 | 581266 | POST | POSTAGE | 5 | 12/8/2011 11:25 | 18.00 | 12621 | Germany | FALSE | 4 |
| 398849 | 581279 | POST | POSTAGE | 3 | 12/8/2011 11:35 | 18.00 | 12437 | France | FALSE | 4 |
| 399699 | 581405 | M | Manual | 3 | 12/8/2011 13:50 | 0.42 | 13521 | United Kingdom | FALSE | 1 |
| 401178 | 581493 | POST | POSTAGE | 1 | 12/9/2011 10:10 | 15.00 | 12423 | Belgium | FALSE | 4 |
| 401196 | 581494 | POST | POSTAGE | 2 | 12/9/2011 10:13 | 18.00 | 12518 | Germany | FALSE | 4 |
| 401244 | C581499 | M | Manual | -1 | 12/9/2011 10:28 | 224.69 | 15498 | United Kingdom | TRUE | 1 |
| 401426 | 581570 | POST | POSTAGE | 1 | 12/9/2011 11:59 | 18.00 | 12662 | Germany | FALSE | 4 |
| 401463 | 581574 | POST | POSTAGE | 2 | 12/9/2011 12:09 | 18.00 | 12526 | Germany | FALSE | 4 |
| 401464 | 581578 | POST | POSTAGE | 3 | 12/9/2011 12:16 | 18.00 | 12713 | Germany | FALSE | 4 |
dim(filtered_data)
- 1903
- 10
filtered_data <- subset(data, len_StockCode < 5)
# Calculate the frequency of each unique value in the "StockCode" column and normalize
value_counts <- format(prop.table(table(filtered_data$StockCode)), scientific=FALSE)
value_counts_dict <- as.list(value_counts)
# Filter out values whose counts are greater than 0
filtered_counts_dict <- value_counts_dict[value_counts_dict > 0]
# Print the filtered dictionary
print(filtered_counts_dict)
$C2 [1] "0.070415134" $CRUK [1] "0.008407777" $D [1] "0.040462428" $DOT [1] "0.008407777" $M [1] "0.241723594" $PADS [1] "0.002101944" $POST [1] "0.628481345"
filtered_data <- subset(data, len_StockCode > 7)
# Calculate the frequency of each unique value in the "StockCode" column and normalize
value_counts <- table(filtered_data$StockCode)
value_counts_dict <- as.list(value_counts)
# Filter out values whose counts are greater than 0
filtered_counts_dict <- value_counts_dict[value_counts_dict > 0]
# Print the filtered dictionary
print(filtered_counts_dict)
$`BANK CHARGES` [1] 12
filtered_data <- subset(data, len_StockCode >= 5 & len_StockCode < 8)
# Calculate the frequency of each unique value in the "StockCode" column and normalize
print(length(unique(filtered_data$StockCode)))
[1] 3676
dim(filtered_data)
- 399689
- 10
dim(data)
- 401604
- 10
# Total number of records
total_records <- nrow(data)
# Number of records with anomalous stock codes
anomalous_records <- total_records - nrow(filtered_data)
# Calculate the percentage of records with anomalous stock codes
percentage_anomalous <- (anomalous_records / total_records) * 100
# Print the percentage
print(paste("The percentage of records with anomalous stock codes in the dataset is:", round(percentage_anomalous, 2), "%"))
[1] "The percentage of records with anomalous stock codes in the dataset is: 0.48 %"
Insights:
A majority of the unique stock codes (3676 out of 3684) contain exactly 5 numeric characters, which seems to be the standard format for representing product codes in this datase.
There are a few anomalies: 7 stock codes contain no numeric characters and 1 stock code contains only 1 numeric charactns.
Based on the ana the these anomalous codes are just a fraction among all unique stock codes (only 8 out of 3684). These codes seem to represent non-product transactions like "BANK CHARGES", "POST" (possibly postage fees), etc. Since they do not represent actual products and are a very small proportion of the dataset, including them in the analysis might introduce noise and distort the clustering and recommendation We have decided told be to filter out and remove rows with these anomalous stock codes from the dataset before proceeding with further nalysis.
data <- data[data$len_StockCode >= 5 & data$len_StockCode < 8, ]
dim(data)
- 399689
- 10
Zero unit prices may indicate free items or data entry errors. To understand their nature, it is essential to investigate these zero unit price transactions further.¶
boxplot(data$UnitPrice, notch = TRUE, main = "Box Plot of UnitPrice", ylab = "UnitPrice")
The box plot reveals a right-skewed distribution of UnitPrice with many outliers at the upper end, suggesting significant variability. Applying a logarithmic scale addresses this by compressing the high-priced outliers and improving visualization, aiding in identifying patterns and trends. This transformation enhances interpretability and facilitates analysis of e-commerce pricing data spanning multiple orders of magnitude. analysis.ization.ploration.
filtered_data <- subset(data, UnitPrice > 0)
# Disable scientific notation for numeric output
options(scipen = 999)
# Create a box plot of the filtered "UnitPrice" column with logarithmic scale on y-axis
boxplot(filtered_data$UnitPrice, notch = TRUE, main = "Box Plot of UnitPrice", ylab = "UnitPrice", log = "y")
The unit prices exhibit a right-skewed distribution, with most transactions involving lower-priced items and a few high-value outliers. The interquartile range suggests that half of the unit prices fall within a relatively narrow range. Outliers at the higher end, potentially representing luxury items, warrant further investigation for potential data errors.r electronics.
table(data$is_Cancelled)
FALSE TRUE 391183 8506
dim(data)
- 399689
- 10
summary(data$UnitPrice)
Min. 1st Qu. Median Mean 3rd Qu. Max. 0.000 1.250 1.950 2.908 3.750 649.500
zero_price_data <- data[data$UnitPrice == 0, ]
# Summary statistics for the "Quantity" column of the filtered data
summary(zero_price_data$Quantity)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.0 2.0 11.0 420.5 36.0 12540.0
The 33 transactions with zero unit price display significant variability in quantity, ranging from 1 to 12540 items with a notable standard deviation. Given the small number of these transactions, excluding them from the dataset is advisable to ensure the accuracy and reliability of the clustering model and recommendation.stem.
zero_price_data <- data[data$UnitPrice == 0, ]
# Display the filtered data
head(zero_price_data)
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | is_Cancelled | len_StockCode | |
|---|---|---|---|---|---|---|---|---|---|---|
| 6843 | 537197 | 22841 | ROUND CAKE TIN VINTAGE GREEN | 1 | 12/5/2010 14:02 | 0 | 12647 | Germany | FALSE | 5 |
| 22620 | 539263 | 22580 | ADVENT CALENDAR GINGHAM SACK | 4 | 12/16/2010 14:36 | 0 | 16560 | United Kingdom | FALSE | 5 |
| 25552 | 539722 | 22423 | REGENCY CAKESTAND 3 TIER | 10 | 12/21/2010 13:45 | 0 | 14911 | EIRE | FALSE | 5 |
| 29375 | 540372 | 22090 | PAPER BUNTING RETROSPOT | 24 | 1/6/2011 16:41 | 0 | 13081 | United Kingdom | FALSE | 5 |
| 29377 | 540372 | 22553 | PLASTERS IN TIN SKULLS | 24 | 1/6/2011 16:41 | 0 | 13081 | United Kingdom | FALSE | 5 |
| 34904 | 541109 | 22168 | ORGANISER WOOD ANTIQUE WHITE | 1 | 1/13/2011 15:10 | 0 | 15107 | United Kingdom | FALSE | 5 |
# Filter the data to remove records with unit price of zero
data <- data[data$UnitPrice > 0, ]
str(data$InvoiceDate)
Factor w/ 23260 levels "1/10/2011 10:04",..: 6839 6839 6839 6839 6839 6839 6839 6840 6840 6841 ...
copied_df <- data.frame(data)
library(dplyr)
# Convert InvoiceDate to datetime with specified format
data$InvoiceDate <- as.POSIXct(data$InvoiceDate, format = "%m/%d/%Y %H:%M")
# Set InvoiceDate as index
data <- data %>%
arrange(InvoiceDate) %>% # Ensure data is sorted by InvoiceDate
mutate(InvoiceDate = as.Date(InvoiceDate)) # Convert to Date type
# Aggregate sales data
data$Sales <- data$Quantity * data$UnitPrice
daily_sales <- data %>%
group_by(InvoiceDate) %>%
summarize(Sales = sum(Sales))
str(data$InvoiceDate)
Date[1:399656], format: "2010-12-01" "2010-12-01" "2010-12-01" "2010-12-01" "2010-12-01" ...
head(data)
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | is_Cancelled | len_StockCode | Sales |
|---|---|---|---|---|---|---|---|---|---|---|
| 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-12-01 | 2.55 | 17850 | United Kingdom | FALSE | 6 | 15.30 |
| 536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-12-01 | 3.39 | 17850 | United Kingdom | FALSE | 5 | 20.34 |
| 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-12-01 | 2.75 | 17850 | United Kingdom | FALSE | 6 | 22.00 |
| 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-12-01 | 3.39 | 17850 | United Kingdom | FALSE | 6 | 20.34 |
| 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-12-01 | 3.39 | 17850 | United Kingdom | FALSE | 6 | 20.34 |
| 536365 | 22752 | SET 7 BABUSHKA NESTING BOXES | 2 | 2010-12-01 | 7.65 | 17850 | United Kingdom | FALSE | 5 | 15.30 |
dim(data)
- 399656
- 11
daily_sales
| InvoiceDate | Sales |
|---|---|
| 2010-12-01 | 45737.56 |
| 2010-12-02 | 42547.42 |
| 2010-12-03 | 25283.73 |
| 2010-12-05 | 30543.61 |
| 2010-12-06 | 30061.75 |
| 2010-12-07 | 53126.64 |
| 2010-12-08 | 37809.22 |
| 2010-12-09 | 35114.58 |
| 2010-12-10 | 32759.73 |
| 2010-12-12 | 17014.20 |
| 2010-12-13 | 27223.84 |
| 2010-12-14 | 26789.23 |
| 2010-12-15 | 29324.76 |
| 2010-12-16 | 43416.13 |
| 2010-12-17 | 21228.41 |
| 2010-12-18 | 33.00 |
| 2010-12-19 | 7153.35 |
| 2010-12-20 | 17334.56 |
| 2010-12-21 | 15415.71 |
| 2010-12-22 | 4776.12 |
| 2010-12-23 | 5200.37 |
| 2011-01-04 | 10858.36 |
| 2011-01-05 | 28074.22 |
| 2011-01-06 | 31704.85 |
| 2011-01-07 | 22942.80 |
| 2011-01-09 | 15334.53 |
| 2011-01-10 | 14763.70 |
| 2011-01-11 | 59042.51 |
| 2011-01-12 | 16133.55 |
| 2011-01-13 | 14782.36 |
| ... | ... |
| 2011-11-06 | 42114.91 |
| 2011-11-07 | 26475.81 |
| 2011-11-08 | 36916.09 |
| 2011-11-09 | 57275.99 |
| 2011-11-10 | 62231.47 |
| 2011-11-11 | 41158.03 |
| 2011-11-13 | 27879.22 |
| 2011-11-14 | 55826.58 |
| 2011-11-15 | 46076.27 |
| 2011-11-16 | 46215.65 |
| 2011-11-17 | 50402.72 |
| 2011-11-18 | 38015.95 |
| 2011-11-20 | 29289.83 |
| 2011-11-21 | 43374.72 |
| 2011-11-22 | 47950.01 |
| 2011-11-23 | 69390.52 |
| 2011-11-24 | 33330.62 |
| 2011-11-25 | 27200.94 |
| 2011-11-27 | 16923.39 |
| 2011-11-28 | 49077.83 |
| 2011-11-29 | 47541.59 |
| 2011-11-30 | 40094.66 |
| 2011-12-01 | 40158.47 |
| 2011-12-02 | 46765.47 |
| 2011-12-04 | 19943.60 |
| 2011-12-05 | 55332.24 |
| 2011-12-06 | 42984.87 |
| 2011-12-07 | 68347.61 |
| 2011-12-08 | 33495.22 |
| 2011-12-09 | 31124.54 |
# Load required packages
library(ggplot2)
library(tibble)
# Convert daily_sales dataframe to tibble
daily_sales <- as_tibble(daily_sales)
# Create time series plot using ggplot2
ggplot(daily_sales, aes(x = InvoiceDate, y = Sales)) +
geom_line() +
labs(title = "Daily Sales Over Time", x = "Date", y = "Sales")
Registered S3 methods overwritten by 'ggplot2': method from [.quosures rlang c.quosures rlang print.quosures rlang
library(ggplot2)
# Aggregate sales by country
sales_by_country <- aggregate(Sales ~ Country, data = data, FUN = sum)
# Create bar plot
ggplot(sales_by_country, aes(x = Country, y = Sales)) +
geom_bar(stat = "identity", fill = "skyblue") +
labs(title = "Total Sales by Country", x = "Country", y = "Total Sales") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Create scatter plot
ggplot(data, aes(x = UnitPrice, y = Quantity)) +
geom_point() +
labs(title = "Scatter Plot of Quantity vs. Unit Price", x = "Unit Price", y = "Quantity")
Time Series Analysis¶
library(lubridate)
library(dplyr)
# Load required libraries
# Convert InvoiceDate to datetime with specified format
data$InvoiceDate <- as.POSIXct(data$InvoiceDate, format = "%m/%d/%Y %H:%M")
# Set InvoiceDate as index and convert to Date type
data <- data %>%
arrange(InvoiceDate) %>% # Ensure data is sorted by InvoiceDate
mutate(InvoiceDate = as.Date(InvoiceDate))
# Calculate sales
data$Sales <- data$Quantity * data$UnitPrice
# Aggregate sales data by month
monthly_sales <- data %>%
group_by(YearMonth = format(InvoiceDate, "%Y-%m")) %>% # Extract year-month from date
summarize(Sales = sum(Sales))
monthly_sales
| YearMonth | Sales |
|---|---|
| 2010-12 | 547893.9 |
| 2011-01 | 471430.3 |
| 2011-02 | 434018.2 |
| 2011-03 | 572153.2 |
| 2011-04 | 422112.7 |
| 2011-05 | 650314.2 |
| 2011-06 | 639609.2 |
| 2011-07 | 581234.8 |
| 2011-08 | 612605.1 |
| 2011-09 | 921808.7 |
| 2011-10 | 961042.5 |
| 2011-11 | 1113102.1 |
| 2011-12 | 338152.0 |
# Plot Monthly Sales
ggplot(monthly_sales, aes(x = YearMonth, y = Sales)) +
geom_point() +
labs(title = "Monthly Sales Over Time", x = "Month", y = "Sales")
library(lubridate)
library(dplyr)
library(plotly)
# Aggregate sales data by month
monthly_sales <- data %>%
group_by(YearMonth = format(InvoiceDate, "%Y-%m")) %>%
summarise(Sales = sum(Sales))
# Plot Monthly Sales
plotly::plot_ly(monthly_sales, x = ~YearMonth, y = ~Sales, type = "scatter", mode = "lines") %>%
plotly::layout(title = "Monthly Sales Over Time", xaxis = list(title = "Month"), yaxis = list(title = "Sales"))
# Load required libraries
library(lubridate)
library(dplyr)
library(plotly)
# Aggregate sales data by week
weekly_sales <- data %>%
group_by(Week = format(InvoiceDate, "%Y-%U")) %>%
summarise(Sales = sum(Sales))
# Plot Weekly Sales
plotly::plot_ly(weekly_sales, x = ~Week, y = ~Sales, type = "scatter", mode = "lines") %>%
plotly::layout(title = "Weekly Sales Over Time", xaxis = list(title = "Week"), yaxis = list(title = "Sales"))
head(data)
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | is_Cancelled | len_StockCode | Sales |
|---|---|---|---|---|---|---|---|---|---|---|
| 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-12-01 | 2.55 | 17850 | United Kingdom | FALSE | 6 | 15.30 |
| 536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-12-01 | 3.39 | 17850 | United Kingdom | FALSE | 5 | 20.34 |
| 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-12-01 | 2.75 | 17850 | United Kingdom | FALSE | 6 | 22.00 |
| 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-12-01 | 3.39 | 17850 | United Kingdom | FALSE | 6 | 20.34 |
| 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-12-01 | 3.39 | 17850 | United Kingdom | FALSE | 6 | 20.34 |
| 536365 | 22752 | SET 7 BABUSHKA NESTING BOXES | 2 | 2010-12-01 | 7.65 | 17850 | United Kingdom | FALSE | 5 | 15.30 |
# Calculate the number of unique values in the "StockCode" column
unique_stock_codes <- data %>%
distinct(StockCode) %>%
nrow()
# Print the number of unique stock codes
print(unique_stock_codes)
[1] 3676
# Calculate the number of unique values in the "CustomerID" column
unique_customer_ids <- data %>%
distinct(CustomerID) %>%
nrow()
# Print the number of unique customer IDs
print(unique_customer_ids)
[1] 4362
head(data)
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | is_Cancelled | len_StockCode | Sales |
|---|---|---|---|---|---|---|---|---|---|---|
| 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-12-01 | 2.55 | 17850 | United Kingdom | FALSE | 6 | 15.30 |
| 536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-12-01 | 3.39 | 17850 | United Kingdom | FALSE | 5 | 20.34 |
| 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-12-01 | 2.75 | 17850 | United Kingdom | FALSE | 6 | 22.00 |
| 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-12-01 | 3.39 | 17850 | United Kingdom | FALSE | 6 | 20.34 |
| 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-12-01 | 3.39 | 17850 | United Kingdom | FALSE | 6 | 20.34 |
| 536365 | 22752 | SET 7 BABUSHKA NESTING BOXES | 2 | 2010-12-01 | 7.65 | 17850 | United Kingdom | FALSE | 5 | 15.30 |
Pareto Principle Analysis¶
Implementing Pareto's 80-20 rule involves verifying if 20% of customers generate 80% of the revenue and if 20% of products account for 80% of sales volume or revenue. This analysis aims to identify significant contributors to revenue concentration and product popularity in the dataset, aligning with the Pareto principle's concept of the "vital few" driving majority outcomes.
# Step 1: Calculate Total Value per Key
value_per_key <- data %>%
group_by(CustomerID) %>%
summarise(Total_Value = sum(Sales))
head(value_per_key)
| CustomerID | Total_Value |
|---|---|
| 12346 | 0.00 |
| 12347 | 4310.00 |
| 12348 | 1437.24 |
| 12349 | 1457.55 |
| 12350 | 294.40 |
| 12352 | 1265.41 |
value_per_key_sorted <- value_per_key %>%
arrange(desc(Total_Value))
head(value_per_key_sorted)
| CustomerID | Total_Value |
|---|---|
| 14646 | 278778.0 |
| 18102 | 259657.3 |
| 17450 | 189575.5 |
| 14911 | 128768.2 |
| 12415 | 123638.2 |
| 14156 | 113685.8 |
value_per_key_sorted$Cumulative_Percentage <- cumsum(value_per_key_sorted$Total_Value) / sum(value_per_key_sorted$Total_Value) * 100
head(value_per_key_sorted)
| CustomerID | Total_Value | Cumulative_Percentage |
|---|---|---|
| 14646 | 278778.0 | 3.372800 |
| 18102 | 259657.3 | 6.514268 |
| 17450 | 189575.5 | 8.807850 |
| 14911 | 128768.2 | 10.365755 |
| 12415 | 123638.2 | 11.861593 |
| 14156 | 113685.8 | 13.237022 |
# Calculate the percentage of customers contributing to 80% of sales
perc <- round(sum(value_per_key_sorted$Cumulative_Percentage < 80) / nrow(value_per_key_sorted) * 100)
# Print the result
print(paste0(perc, "% of Customers contribute to 80% of sales."))
[1] "27% of Customers contribute to 80% of sales."
27% of Customers contribute to 80% of sales.¶
# Calculate the number of rows where Cumulative_Percentage < 80
num_rows <- nrow(value_per_key_sorted[value_per_key_sorted$Cumulative_Percentage < 80, ])
# Print the result
print(num_rows)
[1] 1163
# Step 1: Calculate Total Value per Key
value_per_key_stock <- data %>%
group_by(StockCode) %>%
summarise(Total_Value = sum(Sales))
value_per_key_stock_sorted <- value_per_key_stock %>%
arrange(desc(Total_Value))
value_per_key_stock_sorted$Cumulative_Percentage <- cumsum(value_per_key_stock_sorted$Total_Value) / sum(value_per_key_stock_sorted$Total_Value) * 100
head(value_per_key_stock_sorted)
| StockCode | Total_Value | Cumulative_Percentage |
|---|---|---|
| 22423 | 132567.70 | 1.603872 |
| 85123A | 93923.15 | 2.740203 |
| 85099B | 83056.52 | 3.745064 |
| 47566 | 67628.43 | 4.563267 |
| 84879 | 56331.91 | 5.244800 |
| 23084 | 51042.84 | 5.862342 |
# Calculate the percentage of products contributing to 80% of sales
perc <- round(sum(value_per_key_stock_sorted$Cumulative_Percentage < 80) / nrow(value_per_key_stock_sorted) * 100)
# Print the result
print(paste0(perc, "% of Products contribute to 80% of sales."))
[1] "22% of Products contribute to 80% of sales."
22% of Products contribute to 80% of sales.¶
# Calculate the number of rows where Cumulative_Percentage < 80
num_rows <- nrow(value_per_key_stock_sorted[value_per_key_stock_sorted$Cumulative_Percentage < 80, ])
# Print the result
print(num_rows)
[1] 800
Market Basket Analysis¶
Association Rules: Discover commonly co-purchased products by applying market basket analysis.
# Load required libraries
library(arules)
library(dplyr)
library(tidyr)
# Step 1: Group data by InvoiceNo and create a list of items purchased in each transaction
transactions <- data %>%
group_by(InvoiceNo) %>%
summarize(Items = list(StockCode))
# Step 2: Remove duplicated items in transactions
transactions$Items <- lapply(transactions$Items, unique)
# Step 3: Convert transactions into a transaction object
transactions <- as(transactions$Items, "transactions")
# Step 4: Mine frequent itemsets
frequent_itemsets <- apriori(transactions, parameter = list(support = 0.01))
# Step 5: Find association rules
rules <- apriori(transactions, parameter = list(support = 0.01, confidence = 0.5))
# Step 6: Sort and display the top 10 association rules by lift
rules_df <- as.data.frame(inspect(rules))
top_10_rules <- rules_df[order(-rules_df$lift), ][1:10, ]
# Display the top 10 association rules
print(top_10_rules)
Apriori
Parameter specification:
confidence minval smax arem aval originalSupport maxtime support minlen
0.8 0.1 1 none FALSE TRUE 5 0.01 1
maxlen target ext
10 rules FALSE
Algorithmic control:
filter tree heap memopt load sort verbose
0.1 TRUE TRUE FALSE TRUE 2 TRUE
Absolute minimum support count: 217
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[3676 item(s), 21785 transaction(s)] done [0.25s].
sorting and recoding items ... [532 item(s)] done [0.01s].
creating transaction tree ... done [0.01s].
checking subsets of size 1 2 3 4 done [0.03s].
writing ... [10 rule(s)] done [0.00s].
creating S4 object ... done [0.00s].
Apriori
Parameter specification:
confidence minval smax arem aval originalSupport maxtime support minlen
0.5 0.1 1 none FALSE TRUE 5 0.01 1
maxlen target ext
10 rules FALSE
Algorithmic control:
filter tree heap memopt load sort verbose
0.1 TRUE TRUE FALSE TRUE 2 TRUE
Absolute minimum support count: 217
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[3676 item(s), 21785 transaction(s)] done [0.26s].
sorting and recoding items ... [532 item(s)] done [0.00s].
creating transaction tree ... done [0.02s].
checking subsets of size 1 2 3 4 done [0.03s].
writing ... [196 rule(s)] done [0.00s].
creating S4 object ... done [0.00s].
lhs rhs support confidence lift count
[1] {21086} => {21094} 0.01083314 0.8280702 55.506181 236
[2] {21094} => {21086} 0.01083314 0.7261538 55.506181 236
[3] {84997C} => {84997D} 0.01051182 0.7387097 37.688034 229
[4] {84997D} => {84997C} 0.01051182 0.5362998 37.688034 229
[5] {23254} => {23256} 0.01005279 0.7711268 50.599387 219
[6] {23256} => {23254} 0.01005279 0.6596386 50.599387 219
[7] {23171} => {23170} 0.01064953 0.8436364 54.054759 232
[8] {23170} => {23171} 0.01064953 0.6823529 54.054759 232
[9] {47590B} => {47590A} 0.01175120 0.7052342 42.914878 256
[10] {47590A} => {47590B} 0.01175120 0.7150838 42.914878 256
[11] {22725} => {22727} 0.01055772 0.6865672 16.490480 230
[12] {21136} => {84879} 0.01175120 0.7231638 11.374819 256
[13] {22729} => {22726} 0.01028230 0.5957447 16.042395 224
[14] {22729} => {22727} 0.01165940 0.6755319 16.225428 254
[15] {23295} => {23293} 0.01133808 0.6785714 31.452508 247
[16] {23293} => {23295} 0.01133808 0.5255319 31.452508 247
[17] {22579} => {22578} 0.01042001 0.8194946 40.946536 227
[18] {22578} => {22579} 0.01042001 0.5206422 40.946536 227
[19] {21094} => {21080} 0.01051182 0.7046154 20.912869 229
[20] {82581} => {82580} 0.01042001 0.7394137 41.515791 227
[21] {82580} => {82581} 0.01042001 0.5850515 41.515791 227
[22] {23243} => {22720} 0.01009869 0.5250597 9.391153 220
[23] {22745} => {22748} 0.01170530 0.7993730 49.897827 255
[24] {22748} => {22745} 0.01170530 0.7306590 49.897827 255
[25] {22625} => {22624} 0.01239385 0.5793991 20.067107 270
[26] {23343} => {23344} 0.01133808 0.6301020 23.148015 247
[27] {22749} => {22750} 0.01092495 0.5935162 34.025660 238
[28] {22750} => {22749} 0.01092495 0.6263158 34.025660 238
[29] {22730} => {22726} 0.01266927 0.5750000 15.483776 276
[30] {22730} => {22727} 0.01459720 0.6625000 15.912417 318
[31] {84596F} => {84596B} 0.01037411 0.7820069 47.060831 226
[32] {84596B} => {84596F} 0.01037411 0.6243094 47.060831 226
[33] {22617} => {22138} 0.01432178 0.7107062 17.574045 312
[34] {23200} => {23199} 0.01450539 0.6638655 19.570110 316
[35] {23200} => {23202} 0.01106266 0.5063025 14.177121 241
[36] {23200} => {23203} 0.01248566 0.5714286 11.347832 272
[37] {22328} => {22326} 0.01225614 0.5644820 20.093531 267
[38] {82483} => {82486} 0.01294469 0.5136612 19.293292 282
[39] {21231} => {21232} 0.01188891 0.6348039 21.641946 259
[40] {21930} => {85099B} 0.01142988 0.5253165 6.965319 249
[41] {22385} => {85099B} 0.01234795 0.5592516 7.415274 269
[42] {22866} => {22867} 0.01106266 0.5073684 23.027127 241
[43] {22867} => {22866} 0.01106266 0.5020833 23.027127 241
[44] {22866} => {22865} 0.01289878 0.5915789 22.105570 281
[45] {20712} => {85099B} 0.01216433 0.5289421 7.013393 265
[46] {22114} => {22112} 0.01198072 0.5038610 15.525618 261
[47] {22698} => {22697} 0.02162038 0.7969543 23.685743 471
[48] {22697} => {22698} 0.02162038 0.6425648 23.685743 471
[49] {22698} => {22699} 0.02070232 0.7631134 19.885675 451
[50] {22699} => {22698} 0.02070232 0.5394737 19.885675 451
[51] {22698} => {22423} 0.01464310 0.5397631 6.244684 319
[52] {21928} => {21929} 0.01303649 0.5144928 18.254438 284
[53] {21928} => {85099B} 0.01491852 0.5887681 7.806642 325
[54] {22835} => {22112} 0.01331191 0.5686275 17.521286 290
[55] {22804} => {85123A} 0.01400046 0.7314149 7.888056 305
[56] {22578} => {22577} 0.01468901 0.7339450 34.384927 320
[57] {22577} => {22578} 0.01468901 0.6881720 34.384927 320
[58] {23322} => {23321} 0.01294469 0.5412668 20.122009 282
[59] {22867} => {22865} 0.01230204 0.5583333 20.863279 268
[60] {85099C} => {85099B} 0.01698416 0.5580694 7.399599 370
[61] {20719} => {20724} 0.01331191 0.5380334 15.524579 290
[62] {20971} => {20972} 0.01083314 0.5119306 20.767985 236
[63] {21791} => {21790} 0.01147579 0.5122951 15.393584 250
[64] {21914} => {21915} 0.01248566 0.5240848 20.065355 272
[65] {85099F} => {85099B} 0.01937113 0.6326837 8.388931 422
[66] {22356} => {20724} 0.01496443 0.6468254 18.663697 326
[67] {20723} => {20724} 0.01377094 0.5952381 17.175181 300
[68] {23208} => {23206} 0.01404636 0.5091514 12.633102 306
[69] {23208} => {23209} 0.01464310 0.5307820 11.369800 319
[70] {21929} => {85099B} 0.01537755 0.5456026 7.234299 335
[71] {22355} => {20724} 0.01381685 0.5041876 14.547983 301
[72] {22728} => {22726} 0.01579068 0.5468998 14.727087 344
[73] {22728} => {22727} 0.01849897 0.6406995 15.388797 403
[74] {22697} => {22699} 0.02556805 0.7598909 19.801701 557
[75] {22699} => {22697} 0.02556805 0.6662679 19.801701 557
[76] {22697} => {22423} 0.01758090 0.5225102 6.045080 383
[77] {22662} => {22382} 0.01634152 0.5884298 12.806136 356
[78] {22662} => {20725} 0.01418407 0.5107438 8.365830 309
[79] {82494L} => {82482} 0.02175809 0.5738499 13.983579 474
[80] {82482} => {82494L} 0.02175809 0.5302013 13.983579 474
[81] {22699} => {22423} 0.01983016 0.5167464 5.978396 432
[82] {23202} => {23203} 0.01973835 0.5526992 10.975891 430
[83] {22551} => {22554} 0.01349552 0.5008518 18.430838 294
[84] {22386} => {85099B} 0.02543034 0.6266968 8.309550 554
[85] {21931} => {85099B} 0.02033509 0.5600506 7.425868 443
[86] {22630} => {22629} 0.01946293 0.6784000 20.756944 424
[87] {22629} => {22630} 0.01946293 0.5955056 20.756944 424
[88] {21975} => {21212} 0.01230204 0.5037594 10.542170 268
[89] {22910} => {22086} 0.02084003 0.6403385 14.090681 454
[90] {22726} => {22727} 0.02460409 0.6625464 15.913531 536
[91] {22727} => {22726} 0.02460409 0.5909592 15.913531 536
[92] {21733} => {85123A} 0.02120725 0.6676301 7.200159 462
[93] {21977} => {21212} 0.01551526 0.5007407 10.478998 338
[94] {23300} => {23301} 0.02139087 0.7258567 20.616412 466
[95] {23301} => {23300} 0.02139087 0.6075619 20.616412 466
[96] {84991} => {21212} 0.01533165 0.5030120 10.526530 334
[97] {20726} => {20725} 0.02015148 0.5232420 8.570546 439
[98] {22384} => {20725} 0.02432867 0.5573081 9.128539 530
[99] {22697,22698} => {22699} 0.01822355 0.8428875 21.964478 397
[100] {22698,22699} => {22697} 0.01822355 0.8802661 26.161796 397
[101] {22697,22699} => {22698} 0.01822355 0.7127469 26.272742 397
[102] {22697,22698} => {22423} 0.01257746 0.5817410 6.730338 274
[103] {22423,22698} => {22697} 0.01257746 0.8589342 25.527805 274
[104] {22423,22697} => {22698} 0.01257746 0.7154047 26.370713 274
[105] {22698,22699} => {22423} 0.01243975 0.6008869 6.951844 271
[106] {22423,22698} => {22699} 0.01243975 0.8495298 22.137567 271
[107] {22423,22699} => {22698} 0.01243975 0.6273148 23.123610 271
[108] {22386,85099F} => {85099B} 0.01069543 0.7925170 10.508206 233
[109] {85099B,85099F} => {22386} 0.01069543 0.5521327 13.606573 233
[110] {22726,22728} => {22727} 0.01230204 0.7790698 18.712277 268
[111] {22727,22728} => {22726} 0.01230204 0.6650124 17.907658 268
[112] {22726,22727} => {22728} 0.01230204 0.5000000 17.317170 268
[113] {22697,22699} => {22423} 0.01450539 0.5673250 6.563555 316
[114] {22423,22697} => {22699} 0.01450539 0.8250653 21.500056 316
[115] {22423,22699} => {22697} 0.01450539 0.7314815 21.739869 316
[116] {23202,23203} => {85099B} 0.01046592 0.5302326 7.030503 228
[117] {23202,85099B} => {23203} 0.01046592 0.6570605 13.048371 228
[118] {21931,22386} => {85099B} 0.01023640 0.7852113 10.411337 223
[119] {21931,85099B} => {22386} 0.01023640 0.5033860 12.405276 223
[120] {23206,23209} => {20725} 0.01037411 0.5931759 9.716042 226
[121] {20725,23206} => {23209} 0.01037411 0.5765306 12.349773 226
[122] {20725,23209} => {23206} 0.01037411 0.5368171 13.319545 226
[123] {22383,23206} => {20725} 0.01009869 0.6077348 9.954513 220
[124] {20725,23206} => {22383} 0.01009869 0.5612245 11.501670 220
[125] {20726,22384} => {20725} 0.01101675 0.7430341 12.170674 240
[126] {20725,20726} => {22384} 0.01101675 0.5466970 12.523444 240
[127] {20726,20728} => {20725} 0.01078724 0.6368564 10.431516 235
[128] {20725,20726} => {20728} 0.01078724 0.5353075 11.791379 235
[129] {20725,20728} => {20726} 0.01078724 0.5042918 13.094157 235
[130] {20726,20727} => {20725} 0.01064953 0.6590909 10.795711 232
[131] {20725,20726} => {20727} 0.01064953 0.5284738 10.729545 232
[132] {20726,22382} => {22383} 0.01005279 0.5354523 10.973498 219
[133] {20726,22383} => {22382} 0.01005279 0.5983607 13.022265 219
[134] {20726,22382} => {20725} 0.01193482 0.6356968 10.412523 260
[135] {20725,20726} => {22382} 0.01193482 0.5922551 12.889389 260
[136] {20725,22382} => {20726} 0.01193482 0.5531915 14.363858 260
[137] {20726,22383} => {20725} 0.01184301 0.7049180 11.546345 258
[138] {20725,20726} => {22383} 0.01184301 0.5876993 12.044242 258
[139] {20728,22384} => {20727} 0.01198072 0.6041667 12.266329 261
[140] {20727,22384} => {20728} 0.01198072 0.5588865 12.310761 261
[141] {20727,20728} => {22384} 0.01198072 0.6141176 14.067879 261
[142] {20728,22384} => {22382} 0.01032821 0.5208333 11.335019 225
[143] {22382,22384} => {20728} 0.01032821 0.6081081 13.394980 225
[144] {20728,22382} => {22384} 0.01032821 0.5696203 13.048556 225
[145] {20728,22384} => {22383} 0.01074134 0.5416667 11.100854 234
[146] {22383,22384} => {20728} 0.01074134 0.5792079 12.758387 234
[147] {20728,22383} => {22384} 0.01074134 0.5120350 11.729425 234
[148] {20728,22384} => {20725} 0.01294469 0.6527778 10.692304 282
[149] {20725,22384} => {20728} 0.01294469 0.5320755 11.720186 282
[150] {20725,20728} => {22384} 0.01294469 0.6051502 13.862458 282
[151] {22382,22384} => {20727} 0.01037411 0.6108108 12.401224 226
[152] {20727,22382} => {22384} 0.01037411 0.5393795 12.355817 226
[153] {20727,22384} => {22383} 0.01120037 0.5224839 10.707726 244
[154] {22383,22384} => {20727} 0.01120037 0.6039604 12.262141 244
[155] {20727,22383} => {22384} 0.01120037 0.5224839 11.968783 244
[156] {20727,22384} => {20725} 0.01418407 0.6616702 10.837959 309
[157] {20725,22384} => {20727} 0.01418407 0.5830189 11.836967 309
[158] {20725,20727} => {22384} 0.01418407 0.5908222 13.534239 309
[159] {22382,22384} => {20725} 0.01161350 0.6837838 11.200173 253
[160] {20725,22382} => {22384} 0.01161350 0.5382979 12.331040 253
[161] {22383,22384} => {20725} 0.01299059 0.7004950 11.473898 283
[162] {20725,22384} => {22383} 0.01299059 0.5339623 10.942961 283
[163] {20725,22383} => {22384} 0.01299059 0.5380228 12.324739 283
[164] {20727,20728} => {22382} 0.01028230 0.5270588 11.470506 224
[165] {20728,22382} => {20727} 0.01028230 0.5670886 11.513537 224
[166] {20727,22382} => {20728} 0.01028230 0.5346062 11.775931 224
[167] {20727,20728} => {22383} 0.01156759 0.5929412 12.151668 252
[168] {20728,22383} => {20727} 0.01156759 0.5514223 11.195466 252
[169] {20727,22383} => {20728} 0.01156759 0.5396146 11.886252 252
[170] {20727,20728} => {20725} 0.01207253 0.6188235 10.136143 263
[171] {20725,20728} => {20727} 0.01207253 0.5643777 11.458497 263
[172] {20725,20727} => {20728} 0.01207253 0.5028681 11.076826 263
[173] {20728,22382} => {22383} 0.01106266 0.6101266 12.503864 241
[174] {20728,22383} => {22382} 0.01106266 0.5273523 11.476893 241
[175] {22382,22383} => {20728} 0.01106266 0.5343681 11.770686 241
[176] {20728,22382} => {20725} 0.01087905 0.6000000 9.827820 237
[177] {20725,20728} => {22382} 0.01087905 0.5085837 11.068427 237
[178] {20725,22382} => {20728} 0.01087905 0.5042553 11.107383 237
[179] {20728,22383} => {20725} 0.01276107 0.6083151 9.964018 278
[180] {20725,20728} => {22383} 0.01276107 0.5965665 12.225966 278
[181] {20725,22383} => {20728} 0.01276107 0.5285171 11.641805 278
[182] {20727,22382} => {22383} 0.01156759 0.6014320 12.325678 252
[183] {20727,22383} => {22382} 0.01156759 0.5396146 11.743759 252
[184] {22382,22383} => {20727} 0.01156759 0.5587583 11.344408 252
[185] {20727,22382} => {20725} 0.01179711 0.6133652 10.046737 257
[186] {20725,22382} => {20727} 0.01179711 0.5468085 11.101793 257
[187] {20727,22383} => {20725} 0.01294469 0.6038544 9.890953 282
[188] {20725,20727} => {22383} 0.01294469 0.5391969 11.050240 282
[189] {20725,22383} => {20727} 0.01294469 0.5361217 10.884819 282
[190] {22382,22383} => {20725} 0.01253156 0.6053215 9.914984 273
[191] {20725,22382} => {22383} 0.01253156 0.5808511 11.903895 273
[192] {20725,22383} => {22382} 0.01253156 0.5190114 11.295368 273
[193] {22697,22698,22699} => {22423} 0.01106266 0.6070529 7.023180 241
[194] {22423,22697,22698} => {22699} 0.01106266 0.8795620 22.920166 241
[195] {22423,22698,22699} => {22697} 0.01106266 0.8892989 26.430254 241
[196] {22423,22697,22699} => {22698} 0.01106266 0.7626582 28.112537 241
lhs rhs support confidence lift count
[1] {21086} => {21094} 0.01083314 0.8280702 55.50618 236
[2] {21094} => {21086} 0.01083314 0.7261538 55.50618 236
[7] {23171} => {23170} 0.01064953 0.8436364 54.05476 232
[8] {23170} => {23171} 0.01064953 0.6823529 54.05476 232
[5] {23254} => {23256} 0.01005279 0.7711268 50.59939 219
[6] {23256} => {23254} 0.01005279 0.6596386 50.59939 219
[23] {22745} => {22748} 0.01170530 0.7993730 49.89783 255
[24] {22748} => {22745} 0.01170530 0.7306590 49.89783 255
[31] {84596F} => {84596B} 0.01037411 0.7820069 47.06083 226
[32] {84596B} => {84596F} 0.01037411 0.6243094 47.06083 226
rules_df
| lhs | rhs | support | confidence | lift | count | ||
|---|---|---|---|---|---|---|---|
| [1] | {21086} | => | {21094} | 0.01083314 | 0.8280702 | 55.506181 | 236 |
| [2] | {21094} | => | {21086} | 0.01083314 | 0.7261538 | 55.506181 | 236 |
| [3] | {84997C} | => | {84997D} | 0.01051182 | 0.7387097 | 37.688034 | 229 |
| [4] | {84997D} | => | {84997C} | 0.01051182 | 0.5362998 | 37.688034 | 229 |
| [5] | {23254} | => | {23256} | 0.01005279 | 0.7711268 | 50.599387 | 219 |
| [6] | {23256} | => | {23254} | 0.01005279 | 0.6596386 | 50.599387 | 219 |
| [7] | {23171} | => | {23170} | 0.01064953 | 0.8436364 | 54.054759 | 232 |
| [8] | {23170} | => | {23171} | 0.01064953 | 0.6823529 | 54.054759 | 232 |
| [9] | {47590B} | => | {47590A} | 0.01175120 | 0.7052342 | 42.914878 | 256 |
| [10] | {47590A} | => | {47590B} | 0.01175120 | 0.7150838 | 42.914878 | 256 |
| [11] | {22725} | => | {22727} | 0.01055772 | 0.6865672 | 16.490480 | 230 |
| [12] | {21136} | => | {84879} | 0.01175120 | 0.7231638 | 11.374819 | 256 |
| [13] | {22729} | => | {22726} | 0.01028230 | 0.5957447 | 16.042395 | 224 |
| [14] | {22729} | => | {22727} | 0.01165940 | 0.6755319 | 16.225428 | 254 |
| [15] | {23295} | => | {23293} | 0.01133808 | 0.6785714 | 31.452508 | 247 |
| [16] | {23293} | => | {23295} | 0.01133808 | 0.5255319 | 31.452508 | 247 |
| [17] | {22579} | => | {22578} | 0.01042001 | 0.8194946 | 40.946536 | 227 |
| [18] | {22578} | => | {22579} | 0.01042001 | 0.5206422 | 40.946536 | 227 |
| [19] | {21094} | => | {21080} | 0.01051182 | 0.7046154 | 20.912869 | 229 |
| [20] | {82581} | => | {82580} | 0.01042001 | 0.7394137 | 41.515791 | 227 |
| [21] | {82580} | => | {82581} | 0.01042001 | 0.5850515 | 41.515791 | 227 |
| [22] | {23243} | => | {22720} | 0.01009869 | 0.5250597 | 9.391153 | 220 |
| [23] | {22745} | => | {22748} | 0.01170530 | 0.7993730 | 49.897827 | 255 |
| [24] | {22748} | => | {22745} | 0.01170530 | 0.7306590 | 49.897827 | 255 |
| [25] | {22625} | => | {22624} | 0.01239385 | 0.5793991 | 20.067107 | 270 |
| [26] | {23343} | => | {23344} | 0.01133808 | 0.6301020 | 23.148015 | 247 |
| [27] | {22749} | => | {22750} | 0.01092495 | 0.5935162 | 34.025660 | 238 |
| [28] | {22750} | => | {22749} | 0.01092495 | 0.6263158 | 34.025660 | 238 |
| [29] | {22730} | => | {22726} | 0.01266927 | 0.5750000 | 15.483776 | 276 |
| [30] | {22730} | => | {22727} | 0.01459720 | 0.6625000 | 15.912417 | 318 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| [167] | {20727,20728} | => | {22383} | 0.01156759 | 0.5929412 | 12.151668 | 252 |
| [168] | {20728,22383} | => | {20727} | 0.01156759 | 0.5514223 | 11.195466 | 252 |
| [169] | {20727,22383} | => | {20728} | 0.01156759 | 0.5396146 | 11.886252 | 252 |
| [170] | {20727,20728} | => | {20725} | 0.01207253 | 0.6188235 | 10.136143 | 263 |
| [171] | {20725,20728} | => | {20727} | 0.01207253 | 0.5643777 | 11.458497 | 263 |
| [172] | {20725,20727} | => | {20728} | 0.01207253 | 0.5028681 | 11.076826 | 263 |
| [173] | {20728,22382} | => | {22383} | 0.01106266 | 0.6101266 | 12.503864 | 241 |
| [174] | {20728,22383} | => | {22382} | 0.01106266 | 0.5273523 | 11.476893 | 241 |
| [175] | {22382,22383} | => | {20728} | 0.01106266 | 0.5343681 | 11.770686 | 241 |
| [176] | {20728,22382} | => | {20725} | 0.01087905 | 0.6000000 | 9.827820 | 237 |
| [177] | {20725,20728} | => | {22382} | 0.01087905 | 0.5085837 | 11.068427 | 237 |
| [178] | {20725,22382} | => | {20728} | 0.01087905 | 0.5042553 | 11.107383 | 237 |
| [179] | {20728,22383} | => | {20725} | 0.01276107 | 0.6083151 | 9.964018 | 278 |
| [180] | {20725,20728} | => | {22383} | 0.01276107 | 0.5965665 | 12.225966 | 278 |
| [181] | {20725,22383} | => | {20728} | 0.01276107 | 0.5285171 | 11.641805 | 278 |
| [182] | {20727,22382} | => | {22383} | 0.01156759 | 0.6014320 | 12.325678 | 252 |
| [183] | {20727,22383} | => | {22382} | 0.01156759 | 0.5396146 | 11.743759 | 252 |
| [184] | {22382,22383} | => | {20727} | 0.01156759 | 0.5587583 | 11.344408 | 252 |
| [185] | {20727,22382} | => | {20725} | 0.01179711 | 0.6133652 | 10.046737 | 257 |
| [186] | {20725,22382} | => | {20727} | 0.01179711 | 0.5468085 | 11.101793 | 257 |
| [187] | {20727,22383} | => | {20725} | 0.01294469 | 0.6038544 | 9.890953 | 282 |
| [188] | {20725,20727} | => | {22383} | 0.01294469 | 0.5391969 | 11.050240 | 282 |
| [189] | {20725,22383} | => | {20727} | 0.01294469 | 0.5361217 | 10.884819 | 282 |
| [190] | {22382,22383} | => | {20725} | 0.01253156 | 0.6053215 | 9.914984 | 273 |
| [191] | {20725,22382} | => | {22383} | 0.01253156 | 0.5808511 | 11.903895 | 273 |
| [192] | {20725,22383} | => | {22382} | 0.01253156 | 0.5190114 | 11.295368 | 273 |
| [193] | {22697,22698,22699} | => | {22423} | 0.01106266 | 0.6070529 | 7.023180 | 241 |
| [194] | {22423,22697,22698} | => | {22699} | 0.01106266 | 0.8795620 | 22.920166 | 241 |
| [195] | {22423,22698,22699} | => | {22697} | 0.01106266 | 0.8892989 | 26.430254 | 241 |
| [196] | {22423,22697,22699} | => | {22698} | 0.01106266 | 0.7626582 | 28.112537 | 241 |
- Grouping data by InvoiceNo column to create transaction lists.
- Removing duplicated items within transactions to ensure uniqueness.
- Converting transaction lists into objects required for the Apriori algorithm.
- Mining frequent itemsets with a support threshold of 0.01 and deriving association rules with a confidence threshold of 0.5, then sorting and displaying the top 10 rules by lift.
top_10_rules
| lhs | rhs | support | confidence | lift | count | ||
|---|---|---|---|---|---|---|---|
| [1] | {21086} | => | {21094} | 0.01083314 | 0.8280702 | 55.50618 | 236 |
| [2] | {21094} | => | {21086} | 0.01083314 | 0.7261538 | 55.50618 | 236 |
| [7] | {23171} | => | {23170} | 0.01064953 | 0.8436364 | 54.05476 | 232 |
| [8] | {23170} | => | {23171} | 0.01064953 | 0.6823529 | 54.05476 | 232 |
| [5] | {23254} | => | {23256} | 0.01005279 | 0.7711268 | 50.59939 | 219 |
| [6] | {23256} | => | {23254} | 0.01005279 | 0.6596386 | 50.59939 | 219 |
| [23] | {22745} | => | {22748} | 0.01170530 | 0.7993730 | 49.89783 | 255 |
| [24] | {22748} | => | {22745} | 0.01170530 | 0.7306590 | 49.89783 | 255 |
| [31] | {84596F} | => | {84596B} | 0.01037411 | 0.7820069 | 47.06083 | 226 |
| [32] | {84596B} | => | {84596F} | 0.01037411 | 0.6243094 | 47.06083 | 226 |
- The lhs column indicates the antecedent of association rules, representing items found together in the dataset.
- The rhs column denotes the consequent of association rules, signifying items often purchased alongside those on the lhs.
- Support measures the frequency of occurrence, confidence quantifies rule strength, and lift assesses the degree of association between lhs and rhs items.side.
filtered_data <- subset(data, StockCode == "22748")
filtered_data$Description[1]
filtered_data <- subset(data, StockCode == "22745")
filtered_data$Description[1]
Example:¶
{22745} => {22748} support: 0.01170530 confidence: 0.7993730 lift: 49.89783 count: 255¶
Explanation:¶
lhs: {22745} (left-hand side) indicates the item "22745"¶
¶
rhs: {22748} (right-hand side) indicates the item "2274 " .#### support: 0.01170530 means that the combination of item "22745" and item "22748" occurs in approximately 1.17% of all transacto ns
.¶
confidence: 0.7993730 means that in 79.94% of transactions containing item "22745", item "22748" is also pes e n#### t. lift: 49.89783 indicates that the likelihood of item "22748" being purchased when item "22745" is also purchased is approximately 49.90 times higher than if they were purchased indepnde n t#### ly. count: 255 represents the absolute count of transactions where both items "22745" and "22748" are pr.
RMF Analysis¶
RFM Analysis is a customer segmentation technique that evaluates Recency, Frequency, and Monetary value to identify high-value customers. Recency indicates responsiveness to promotions, Frequency reflects engagement and loyalty, and Monetary value distinguishes between heavy spenders and low-value purchasers. By analyzing these metrics, marketers can tailor communications to specific customer segments, enhancing sales opportunities and customer retention.
Here's a breakdown of each component:
Recency (R) measures how recently a customer made a purchase, with higher scores indicating more responsiveness to promotions. Frequency (F) gauges how often a customer purchases, indicating engagement and loyalty. Monetary (M) reflects the amount a customer spends, distinguishing between heavy spenders and low-value purchasers.
# Calculate Monetary value for each transaction if not already included
data$Monetary <- data$Quantity * data$UnitPrice
# Ensure InvoiceDate is a datetime type
data$InvoiceDate <- as.POSIXct(data$InvoiceDate, format = "%Y-%m-%d %H:%M:%S")
# Calculate Recency in days
current_date <- as.Date(max(data$InvoiceDate)) + 1
data$Recency <- as.numeric(difftime(current_date, data$InvoiceDate, units = "days"))
# Group by CustomerID to calculate RFM values per customer
rfm_df <- data %>%
group_by(CustomerID) %>%
summarize(
Recency = min(Recency), # Most recent purchase date for each customer
Frequency = n_distinct(InvoiceNo), # Number of unique transactions (counts of invoices) for frequency
Monetary = sum(Monetary) # Total sum for monetary value
)
# Inspecting the first few rows of the RFM dataframe
head(rfm_df)
| CustomerID | Recency | Frequency | Monetary |
|---|---|---|---|
| 12346 | 326 | 2 | 0.00 |
| 12347 | 3 | 7 | 4310.00 |
| 12348 | 76 | 4 | 1437.24 |
| 12349 | 19 | 1 | 1457.55 |
| 12350 | 311 | 1 | 294.40 |
| 12352 | 37 | 8 | 1265.41 |
Recency: Lower values signify more recent purchases, with customer 12347 being the most recent (Recency = 3) and customer 12350 the least recent (Recency = 311).
Frequency: Higher values indicate more purchases, with customer 12352 having the highest frequency (Frequency = 8) and customers 12349 and 12350 the lowest (Frequency = 1).
Monetary: Higher values denote higher spending, with customer 12347 having the highest monetary value ($4310.00) and customer 12346 the lowest ($0.00).
Customer Segmentation - K means Clustering¶
- Customer segmentation enhances marketing by tailoring experiences and campaigns.
- Relevant attributes are vital for effective segmentation strategies.
- K-Means clustering is a key tool for grouping customers based on similarities.
unique_customers <- data %>%
distinct(CustomerID) %>%
nrow()
print(unique_customers)
[1] 4362
dim(rfm_df)
- 4362
- 4
head(rfm_df)
| CustomerID | Recency | Frequency | Monetary |
|---|---|---|---|
| 12346 | 326 | 2 | 0.00 |
| 12347 | 3 | 7 | 4310.00 |
| 12348 | 76 | 4 | 1437.24 |
| 12349 | 19 | 1 | 1457.55 |
| 12350 | 311 | 1 | 294.40 |
| 12352 | 37 | 8 | 1265.41 |
# Boxplot for Recency
ggplot(data = rfm_df, aes(x = "", y = Recency)) +
geom_boxplot() +
labs(title = "Boxplot of Recency")
# Boxplot for Frequency
ggplot(data = rfm_df, aes(x = "", y = Frequency)) +
geom_boxplot() +
labs(title = "Boxplot of Frequency")
# Boxplot for Monetary
ggplot(data = rfm_df, aes(x = "", y = Monetary)) +
geom_boxplot() +
labs(title = "Boxplot of Monetary")
# Remove the CustomerID column
new_df <- rfm_df[, c("Recency", "Frequency", "Monetary")]
# Remove outliers
new_df <- new_df[!apply(new_df, 1, function(x) any(abs(scale(x)) > 3)), ]
head(new_df)
| Recency | Frequency | Monetary |
|---|---|---|
| 326 | 2 | 0.00 |
| 3 | 7 | 4310.00 |
| 76 | 4 | 1437.24 |
| 19 | 1 | 1457.55 |
| 311 | 1 | 294.40 |
| 37 | 8 | 1265.41 |
# Select the columns
col_names <- c("Recency", "Frequency", "Monetary")
features <- new_df[, col_names]
# Scale the features
scaled_features <- scale(features)
# Convert the scaled features back to a dataframe
scaled_features <- as.data.frame(scaled_features)
# Rename the columns
colnames(scaled_features) <- col_names
head(scaled_features)
| Recency | Frequency | Monetary |
|---|---|---|
| 2.3105470 | -0.3296525 | -0.22923618 |
| -0.8882559 | 0.2208200 | 0.29217240 |
| -0.1653067 | -0.1094635 | -0.05536396 |
| -0.7298013 | -0.4397470 | -0.05290693 |
| 2.1619958 | -0.4397470 | -0.19362071 |
| -0.5515398 | 0.3309144 | -0.07615135 |
# Vector to store SSE values
SSE <- numeric(0)
# Loop through different numbers of clusters
for (cluster in 1:9) {
# Fit KMeans model
kmeans_model <- kmeans(scaled_features, centers = cluster, nstart = 10)
# Calculate SSE and store it
SSE <- c(SSE, kmeans_model$tot.withinss)
}
# Create a dataframe with Cluster and SSE values
frame <- data.frame(Cluster = 1:9, SSE = SSE)
# Plot SSE against number of clusters
plot(frame$Cluster, frame$SSE, type = "b", pch = 19, xlab = "Number of clusters", ylab = "Inertia")
k <- 4 # Number of clusters
# Fit the KMeans model
kmeans_model <- kmeans(scaled_features, centers = k, nstart = 25)
# Get cluster labels
cluster_labels <- kmeans_model$cluster
# Combine cluster labels with the original dataframe
rfm_df$Cluster <- cluster_labels
print(kmeans_model$centers)
Recency Frequency Monetary 1 -0.7992213 2.43874285 1.18815081 2 -0.8648478 11.25028573 14.82165577 3 1.5570307 -0.35259309 -0.17376008 4 -0.4884870 -0.08048749 -0.07198866
head(rfm_df)
| CustomerID | Recency | Frequency | Monetary | Cluster |
|---|---|---|---|---|
| 12346 | 326 | 2 | 0.00 | 3 |
| 12347 | 3 | 7 | 4310.00 | 4 |
| 12348 | 76 | 4 | 1437.24 | 4 |
| 12349 | 19 | 1 | 1457.55 | 4 |
| 12350 | 311 | 1 | 294.40 | 3 |
| 12352 | 37 | 8 | 1265.41 | 4 |
library(dplyr)
library(ggplot2)
# Group by cluster and calculate mean values for each feature
avg_df <- rfm_df %>%
group_by(Cluster) %>%
summarise(
Recency = mean(Recency),
Frequency = mean(Frequency),
Monetary = mean(Monetary)
)
avg_df
| Cluster | Recency | Frequency | Monetary |
|---|---|---|---|
| 1 | 11.990291 | 27.145631 | 11716.2205 |
| 2 | 5.363636 | 107.181818 | 124411.7309 |
| 3 | 249.913488 | 1.791628 | 458.5693 |
| 4 | 43.366775 | 4.263192 | 1299.8190 |
ggplot(avg_df, aes(x = as.factor(Cluster), y = Recency, fill = as.factor(Cluster))) +
geom_bar(stat = "identity", color = "black") +
labs(x = "Cluster", y = "Recency", title = "Average Recency by Cluster") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(avg_df, aes(x = as.factor(Cluster), y = Frequency, fill = as.factor(Cluster))) +
geom_bar(stat = "identity", color = "black") +
labs(x = "Cluster", y = "Frequency", title = "Average Frequency by Cluster") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(avg_df, aes(x = as.factor(Cluster), y = Monetary, fill = as.factor(Cluster))) +
geom_bar(stat = "identity", color = "black") +
labs(x = "Cluster", y = "Monetary", title = "Average Monetary by Cluster") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(rfm_df, aes(x = Recency, y = Frequency, z = Monetary, color = as.factor(Cluster))) +
geom_point() +
labs(title = "Scatter Plot of RFM Clusters (Individual)",
x = "Recency", y = "Frequency", z = "Monetary") +
theme_minimal()
head(rfm_df)
| CustomerID | Recency | Frequency | Monetary | Cluster |
|---|---|---|---|---|
| 12346 | 326 | 2 | 0.00 | 3 |
| 12347 | 3 | 7 | 4310.00 | 4 |
| 12348 | 76 | 4 | 1437.24 | 4 |
| 12349 | 19 | 1 | 1457.55 | 4 |
| 12350 | 311 | 1 | 294.40 | 3 |
| 12352 | 37 | 8 | 1265.41 | 4 |
# Create scatter plot
ggplot(rfm_df, aes(x = Frequency, y = Monetary, color = as.factor(Cluster))) +
geom_point() +
labs(title = "Segmentation K-means",
x = "Frequency", y = "Monetary") +
scale_color_manual(values = c("green", "red", "cyan", "magenta")) +
theme_minimal()
dim(rfm_df)
- 4362
- 6
# Calculate the first and third quartiles
Q1 <- quantile(data$Quantity, 0.05)
Q3 <- quantile(data$Quantity, 0.95)
# Calculate the IQR
IQR <- Q3 - Q1
# Define the upper and lower bounds for outliers
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
# Filter the data to remove outliers
filtered_data <- subset(data, Quantity >= lower_bound & Quantity <= upper_bound)
# Calculate the first and third quartiles
Q1 <- quantile(filtered_data$UnitPrice, 0.05)
Q3 <- quantile(filtered_data$UnitPrice, 0.95)
# Calculate the IQR
IQR <- Q3 - Q1
# Define the upper and lower bounds for outliers
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
# Filter the data to remove outliers
filtered_data <- subset(filtered_data, UnitPrice >= lower_bound & UnitPrice <= upper_bound)
head(filtered_data)
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | is_Cancelled | len_StockCode | Sales | Monetary | Recency |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-11-30 18:00:00 | 2.55 | 17850 | United Kingdom | FALSE | 6 | 15.30 | 15.30 | 374 |
| 536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-11-30 18:00:00 | 3.39 | 17850 | United Kingdom | FALSE | 5 | 20.34 | 20.34 | 374 |
| 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-11-30 18:00:00 | 2.75 | 17850 | United Kingdom | FALSE | 6 | 22.00 | 22.00 | 374 |
| 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-11-30 18:00:00 | 3.39 | 17850 | United Kingdom | FALSE | 6 | 20.34 | 20.34 | 374 |
| 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-11-30 18:00:00 | 3.39 | 17850 | United Kingdom | FALSE | 6 | 20.34 | 20.34 | 374 |
| 536365 | 22752 | SET 7 BABUSHKA NESTING BOXES | 2 | 2010-11-30 18:00:00 | 7.65 | 17850 | United Kingdom | FALSE | 5 | 15.30 | 15.30 | 374 |
dim(filtered_data)
- 390346
- 13
# Distribution Analysis
print("\nDistribution Analysis:")
# Quantity Distribution
ggplot(filtered_data, aes(x = Quantity)) +
geom_histogram(binwidth = 10, fill = "skyblue", color = "black") +
labs(title = "Quantity Distribution", x = "Quantity", y = "Frequency") +
theme_minimal()
[1] "\nDistribution Analysis:"
ggplot(filtered_data, aes(x = UnitPrice)) +
geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
labs(title = "UnitPrice Distribution", x = "UnitPrice", y = "Frequency") +
theme_minimal()
print("\nTime Series Analysis:")
# Resample to monthly frequency
filtered_data$InvoiceDate <- as.Date(filtered_data$InvoiceDate)
monthly_sales <- filtered_data %>%
group_by(month = floor_date(InvoiceDate, "month")) %>%
summarize(total_quantity = sum(Quantity))
[1] "\nTime Series Analysis:"
ggplot(monthly_sales, aes(x = month, y = total_quantity)) +
geom_line() +
labs(title = "Monthly Sales Volume", x = "Month", y = "Total Quantity Sold") +
theme_minimal()
# Customer Analysis
print("\nCustomer Analysis:")
# Top 10 customers by spend
top_customers <- filtered_data %>%
group_by(CustomerID) %>%
summarize(total_spend = sum(Quantity * UnitPrice)) %>%
arrange(desc(total_spend)) %>%
top_n(10)
[1] "\nCustomer Analysis:"
Selecting by total_spend
ggplot(top_customers, aes(x = reorder(CustomerID, -total_spend), y = total_spend)) +
geom_bar(stat = "identity", fill = "skyblue") +
labs(title = "Top 10 Customers by Spend", x = "Customer ID", y = "Total Spend") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print("\nProduct Analysis:")
# Top 10 products by quantity sold
top_products <- filtered_data %>%
group_by(Description) %>%
summarize(total_quantity = sum(Quantity)) %>%
arrange(desc(total_quantity)) %>%
top_n(10)
ggplot(top_products, aes(x = reorder(Description, -total_quantity), y = total_quantity)) +
geom_bar(stat = "identity", fill = "skyblue") +
labs(title = "Top 10 Products by Quantity Sold", x = "Product Description", y = "Quantity Sold") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
[1] "\nProduct Analysis:"
Selecting by total_quantity
# Geographical Insights
print("\nGeographical Insights:")
# Sales by country
sales_by_country <- filtered_data %>%
group_by(Country) %>%
summarize(total_spend = sum(Quantity * UnitPrice)) %>%
arrange(desc(total_spend))
ggplot(sales_by_country, aes(x = reorder(Country, -total_spend), y = total_spend)) +
geom_bar(stat = "identity", fill = "skyblue") +
labs(title = "Sales by Country", x = "Country", y = "Total Spend") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
[1] "\nGeographical Insights:"
# Sales Performance
print("\nSales Performance:")
# Monthly revenue
monthly_revenue <- filtered_data %>%
group_by(month = floor_date(InvoiceDate, "month")) %>%
summarize(total_revenue = sum(Quantity * UnitPrice))
ggplot(monthly_revenue, aes(x = month, y = total_revenue)) +
geom_line() +
labs(title = "Monthly Revenue", x = "Month", y = "Total Revenue") +
theme_minimal()
[1] "\nSales Performance:"
# Order Analysis
print("\nOrder Analysis:")
# Number of items per order
items_per_order <- filtered_data %>%
group_by(InvoiceNo) %>%
summarize(total_quantity = sum(Quantity))
ggplot(items_per_order, aes(x = total_quantity)) +
geom_histogram(binwidth = 10, fill = "skyblue", color = "black") +
labs(title = "Distribution of Items per Order", x = "Items per Order", y = "Frequency") +
theme_minimal()
[1] "\nOrder Analysis:"